xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2dfe29262b4468d5fbb057f165946753e529c684)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
/* Option-name tables for PetscOptionsEnum(); the trailing entries are the enum type name,
   the option prefix, and a NULL terminator, per the PetscOptionsEnum() convention. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Entries are ordered by the 0-based cuSPARSE enum value they correspond to (see comment above). */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
/* ---- Forward declarations of the SeqAIJCUSPARSE implementation routines ---- */

/* symbolic Cholesky/ICC factorization entry points and the shared numeric routine */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* symbolic LU/ILU factorization entry points and the shared numeric routine */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* triangular solves; the NaturalOrdering variants skip the row/column permutations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* destructors for the GPU-side data structures; note the two
   MatSeqAIJCUSPARSEMultStruct_Destroy declarations are C++ overloads
   distinguished by their argument types */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* host<->device migration helpers */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* COO assembly interface (used by the MPI layer) */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
92 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
93 {
94   cusparseStatus_t   stat;
95   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
96 
97   PetscFunctionBegin;
98   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
99   cusparsestruct->stream = stream;
100   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
101   PetscFunctionReturn(0);
102 }
103 
104 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
105 {
106   cusparseStatus_t   stat;
107   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
108 
109   PetscFunctionBegin;
110   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
111   if (cusparsestruct->handle != handle) {
112     if (cusparsestruct->handle) {
113       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
114     }
115     cusparsestruct->handle = handle;
116   }
117   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
118   PetscFunctionReturn(0);
119 }
120 
121 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
122 {
123   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
124   PetscBool          flg;
125   PetscErrorCode     ierr;
126 
127   PetscFunctionBegin;
128   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
129   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
130   if (cusparsestruct->handle) cusparsestruct->handle = 0;
131   PetscFunctionReturn(0);
132 }
133 
/* Reports MATSOLVERCUSPARSE as the solver package providing this matrix's factorizations;
   composed on factor matrices under the "MatFactorGetSolverType_C" method name. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
153 
154 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
155 {
156   PetscErrorCode ierr;
157   PetscInt       n = A->rmap->n;
158 
159   PetscFunctionBegin;
160   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
161   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
162   (*B)->factortype = ftype;
163   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
164 
165   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
166     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
167     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
168     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
169     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
170     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
171     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
172   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
175     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
176     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
177   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
178 
179   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
180   (*B)->canuseordering = PETSC_TRUE;
181   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
182   PetscFunctionReturn(0);
183 }
184 
185 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
186 {
187   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
188 
189   PetscFunctionBegin;
190   switch (op) {
191   case MAT_CUSPARSE_MULT:
192     cusparsestruct->format = format;
193     break;
194   case MAT_CUSPARSE_ALL:
195     cusparsestruct->format = format;
196     break;
197   default:
198     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
199   }
200   PetscFunctionReturn(0);
201 }
202 
203 /*@
204    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205    operation. Only the MatMult operation can use different GPU storage formats
206    for MPIAIJCUSPARSE matrices.
207    Not Collective
208 
209    Input Parameters:
210 +  A - Matrix of type SEQAIJCUSPARSE
211 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
213 
214    Output Parameter:
215 
216    Level: intermediate
217 
218 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219 @*/
220 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
221 {
222   PetscErrorCode ierr;
223 
224   PetscFunctionBegin;
225   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
226   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
227   PetscFunctionReturn(0);
228 }
229 
230 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
231 {
232   PetscErrorCode ierr;
233 
234   PetscFunctionBegin;
235   switch (op) {
236     case MAT_FORM_EXPLICIT_TRANSPOSE:
237       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
238       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
239       A->form_explicit_transpose = flg;
240       break;
241     default:
242       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
243       break;
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249 
250 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
251 {
252   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
253   IS             isrow = b->row,iscol = b->col;
254   PetscBool      row_identity,col_identity;
255   PetscErrorCode ierr;
256 
257   PetscFunctionBegin;
258   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
259   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
260   B->offloadmask = PETSC_OFFLOAD_CPU;
261   /* determine which version of MatSolve needs to be used. */
262   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
263   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
264   if (row_identity && col_identity) {
265     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
266     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
267     B->ops->matsolve = NULL;
268     B->ops->matsolvetranspose = NULL;
269   } else {
270     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
271     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
272     B->ops->matsolve = NULL;
273     B->ops->matsolvetranspose = NULL;
274   }
275 
276   /* get the triangular factors */
277   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
278   PetscFunctionReturn(0);
279 }
280 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes command-line options controlling
   the GPU storage format and (for CUDA >= 11) the cuSPARSE algorithm choices
   for SpMV, SpMM, and CSR-to-CSC conversion.

   Only non-factored matrices (MAT_FACTOR_NONE) expose these options; factor
   matrices keep their defaults.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for MatMult only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* storage format used for all operations */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same position-vs-value consistency check as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
319 
320 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
321 {
322   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
323   PetscErrorCode               ierr;
324 
325   PetscFunctionBegin;
326   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
327   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
328   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
329   PetscFunctionReturn(0);
330 }
331 
332 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
333 {
334   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
335   PetscErrorCode               ierr;
336 
337   PetscFunctionBegin;
338   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
339   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
340   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
341   PetscFunctionReturn(0);
342 }
343 
344 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
345 {
346   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
347   PetscErrorCode               ierr;
348 
349   PetscFunctionBegin;
350   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
351   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
352   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
353   PetscFunctionReturn(0);
354 }
355 
356 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
357 {
358   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
359   PetscErrorCode               ierr;
360 
361   PetscFunctionBegin;
362   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
363   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
364   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
365   PetscFunctionReturn(0);
366 }
367 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (first call) or refreshes
   (subsequent calls) the unit lower triangular factor L of an ILU/LU
   factorization as a CSR matrix on the GPU, and performs the cuSPARSE
   triangular-solve analysis for it.

   Assumes the PETSc factored-AIJ layout where a->i/a->j index the strictly
   lower triangular part of each row (the upper part is reached via a->diag)
   -- NOTE(review): confirm against MatLUFactorNumeric_SeqAIJ's storage.
   A unit diagonal entry (1.0) is appended explicitly to every row.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the up-to-date data lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers for faster host->device transfer */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 holds only its unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* device-side CSR arrays; assign() performs the host->device copies */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer csrsv2-style API needs an explicit user-provided work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for later value-only updates; index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix: same traversal as above, values only */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      /* thrust/cusparse wrappers may throw; convert to a PETSc error */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
514 
515 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
516 {
517   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
518   PetscInt                          n = A->rmap->n;
519   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
520   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
521   cusparseStatus_t                  stat;
522   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
523   const MatScalar                   *aa = a->a,*v;
524   PetscInt                          *AiUp, *AjUp;
525   PetscInt                          i,nz, nzUpper, offset;
526   PetscErrorCode                    ierr;
527   cudaError_t                       cerr;
528 
529   PetscFunctionBegin;
530   if (!n) PetscFunctionReturn(0);
531   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
532     try {
533       /* next, figure out the number of nonzeros in the upper triangular matrix. */
534       nzUpper = adiag[0]-adiag[n];
535       if (!upTriFactor) {
536         PetscScalar *AAUp;
537 
538         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
539 
540         /* Allocate Space for the upper triangular matrix */
541         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
542         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
543 
544         /* Fill the upper triangular matrix */
545         AiUp[0]=(PetscInt) 0;
546         AiUp[n]=nzUpper;
547         offset = nzUpper;
548         for (i=n-1; i>=0; i--) {
549           v  = aa + adiag[i+1] + 1;
550           vi = aj + adiag[i+1] + 1;
551 
552           /* number of elements NOT on the diagonal */
553           nz = adiag[i] - adiag[i+1]-1;
554 
555           /* decrement the offset */
556           offset -= (nz+1);
557 
558           /* first, set the diagonal elements */
559           AjUp[offset] = (PetscInt) i;
560           AAUp[offset] = (MatScalar)1./v[nz];
561           AiUp[i]      = AiUp[i+1] - (nz+1);
562 
563           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
564           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
565         }
566 
567         /* allocate space for the triangular factor information */
568         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
569         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
570 
571         /* Create the matrix description */
572         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
573         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
574        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
575         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
576        #else
577         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
578        #endif
579         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
580         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
581 
582         /* set the operation */
583         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
584 
585         /* set the matrix */
586         upTriFactor->csrMat = new CsrMatrix;
587         upTriFactor->csrMat->num_rows = n;
588         upTriFactor->csrMat->num_cols = n;
589         upTriFactor->csrMat->num_entries = nzUpper;
590 
591         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
593 
594         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
596 
597         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
599 
600         /* Create the solve analysis information */
601         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
602         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
603       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
604         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
608                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
609         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
610       #endif
611 
612         /* perform the solve analysis */
613         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616                                  upTriFactor->csrMat->column_indices->data().get(),
617                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618                                  upTriFactor->solveInfo,
619                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
620                                #else
621                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
622                                #endif
623         cerr = WaitForCUDA();CHKERRCUDA(cerr);
624         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
625 
626         /* assign the pointer */
627         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
628         upTriFactor->AA_h = AAUp;
629         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
630         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
631         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
632       } else {
633         if (!upTriFactor->AA_h) {
634           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
635         }
636         /* Fill the upper triangular matrix */
637         offset = nzUpper;
638         for (i=n-1; i>=0; i--) {
639           v  = aa + adiag[i+1] + 1;
640 
641           /* number of elements NOT on the diagonal */
642           nz = adiag[i] - adiag[i+1]-1;
643 
644           /* decrement the offset */
645           offset -= (nz+1);
646 
647           /* first, set the diagonal elements */
648           upTriFactor->AA_h[offset] = 1./v[nz];
649           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
650         }
651         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
652         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
653       }
654     } catch(char *ex) {
655       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
656     }
657   }
658   PetscFunctionReturn(0);
659 }
660 
/*
   Pushes both triangular factors of the ILU(0) decomposition of A to the
   GPU and caches (once) the device-side row/column permutation indices
   needed by the permuted MatSolve path.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* upload the two triangular factors to the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* row permutation: copied to the device only on the first non-identity use */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !factors->rpermIndices) {
    const PetscInt *rowIdx;

    ierr = ISGetIndices(isrow,&rowIdx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(rowIdx,rowIdx+n);
    ierr = ISRestoreIndices(isrow,&rowIdx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same once-only upload */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !factors->cpermIndices) {
    const PetscInt *colIdx;

    ierr = ISGetIndices(iscol,&colIdx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(colIdx,colIdx+n);
    ierr = ISRestoreIndices(iscol,&colIdx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* scratch vector for the triangular solves, plus bookkeeping */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz   = a->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
704 
/*
   Copies the numeric ICC/Cholesky factor of A from the host to the GPU as
   two cuSPARSE triangular factors.

   On the first call (neither factor structure exists yet) the CSR pattern
   is built on the host, both pattern and values are uploaded, and the
   cusparse triangular-solve analysis is performed for each factor.  On
   later calls only the numerical values are recomputed and re-uploaded.

   NOTE(review): A->data is read through both Mat_SeqAIJ and Mat_SeqSBAIJ
   views; presumably the factored matrix stores only the upper triangle in
   SBAIJ-compatible arrays — confirm against the factorization routine.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;      /* host staging: row offsets / column indices of U */
  PetscScalar                       *AAUp;             /* host staging: values of the upper factor */
  PetscScalar                       *AALo;             /* host staging: values of the companion "lower" factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers so the later thrust assign()s (device copies) are fast */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first call: build the CSR pattern of the factor on the host */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* v[nz] is the stored diagonal entry of row i; its reciprocal is
             placed first in the output row (hedge: whether v[nz] is d or 1/d
             depends on the host factorization's storage — TODO confirm) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: upper factor gets -v, "lower" factor gets -v scaled by the diagonal */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* unit diagonal: the 1/d values stored on the diagonal are ignored by the solve */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 needs an explicit work buffer; query its size, then allocate */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* NOTE: FILL_MODE_UPPER is intentional here — the "lower" factor reuses the
           upper-triangular pattern (AiUp/AjUp) and is applied with a TRANSPOSE solve below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* pattern is shared with the upper factor; only the values differ (AALo) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* update path: pattern already on the device, recompute values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      /* the staging buffers are released in both paths */
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
924 
/*
   Copies the ICC factor data of A to the GPU and records the row/column
   permutation indices used by MatSolve_SeqAIJCUSPARSE.

   Fix: the device permutation arrays are now allocated only once (values
   are still refreshed on every call).  Previously each numeric
   refactorization with a non-identity ordering allocated new
   THRUSTINTARRAYs over the old pointers, leaking device memory; the ILU
   counterpart (MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU) already guards
   these allocations the same way.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nonzeros of both triangular factors, counting the shared diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* allocate once, refresh values each time (avoids leaking on refactorization) */
    if (!cusparseTriFactors->rpermIndices) cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
962 
/*
   Numeric Cholesky factorization for SeqAIJCUSPARSE: runs the host (SeqAIJ)
   numeric factorization, selects the solve kernels according to whether the
   ordering is the natural one, and pushes the factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure host data is current, then factor on the host */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural (identity) ordering gets the permutation-free solve kernels */
  ierr = ISIdentity(ip,&natural);CHKERRQ(ierr);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                      : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                      : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
992 
993 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
994 {
995   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
996   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
997   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
998   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
999   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1000   cusparseStatus_t                  stat;
1001   cusparseIndexBase_t               indexBase;
1002   cusparseMatrixType_t              matrixType;
1003   cusparseFillMode_t                fillMode;
1004   cusparseDiagType_t                diagType;
1005   cudaError_t                       cerr;
1006   PetscErrorCode                    ierr;
1007 
1008   PetscFunctionBegin;
1009   /* allocate space for the transpose of the lower triangular factor */
1010   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1011   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1012 
1013   /* set the matrix descriptors of the lower triangular factor */
1014   matrixType = cusparseGetMatType(loTriFactor->descr);
1015   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1016   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1017     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1018   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1019 
1020   /* Create the matrix description */
1021   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1022   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1023   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1024   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1025   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1026 
1027   /* set the operation */
1028   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1029 
1030   /* allocate GPU space for the CSC of the lower triangular factor*/
1031   loTriFactorT->csrMat = new CsrMatrix;
1032   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1033   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1034   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1035   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1036   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1037   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1038 
1039   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1040 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1041   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1042                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1043                                        loTriFactor->csrMat->values->data().get(),
1044                                        loTriFactor->csrMat->row_offsets->data().get(),
1045                                        loTriFactor->csrMat->column_indices->data().get(),
1046                                        loTriFactorT->csrMat->values->data().get(),
1047                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1049                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1050   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1051 #endif
1052 
1053   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1054   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1055                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1056                           loTriFactor->csrMat->values->data().get(),
1057                           loTriFactor->csrMat->row_offsets->data().get(),
1058                           loTriFactor->csrMat->column_indices->data().get(),
1059                           loTriFactorT->csrMat->values->data().get(),
1060                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1061                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1062                           CUSPARSE_ACTION_NUMERIC, indexBase,
1063                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1064                         #else
1065                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1066                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1067                         #endif
1068   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1069   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1070 
1071   /* Create the solve analysis information */
1072   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1073   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1074 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1075   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1079                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1080   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1081 #endif
1082 
1083   /* perform the solve analysis */
1084   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1085                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1086                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1087                            loTriFactorT->csrMat->column_indices->data().get(),
1088                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1089                            loTriFactorT->solveInfo,
1090                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1091                           #else
1092                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1093                           #endif
1094   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1095   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1096 
1097   /* assign the pointer */
1098   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1099 
1100   /*********************************************/
1101   /* Now the Transpose of the Upper Tri Factor */
1102   /*********************************************/
1103 
1104   /* allocate space for the transpose of the upper triangular factor */
1105   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1106   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1107 
1108   /* set the matrix descriptors of the upper triangular factor */
1109   matrixType = cusparseGetMatType(upTriFactor->descr);
1110   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1111   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1112     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1113   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1114 
1115   /* Create the matrix description */
1116   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1117   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1118   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1119   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1120   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1121 
1122   /* set the operation */
1123   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1124 
1125   /* allocate GPU space for the CSC of the upper triangular factor*/
1126   upTriFactorT->csrMat = new CsrMatrix;
1127   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1128   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1129   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1130   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1131   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1132   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1133 
1134   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1135 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1136   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1137                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1138                                 upTriFactor->csrMat->values->data().get(),
1139                                 upTriFactor->csrMat->row_offsets->data().get(),
1140                                 upTriFactor->csrMat->column_indices->data().get(),
1141                                 upTriFactorT->csrMat->values->data().get(),
1142                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1143                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1144                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1145   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1146 #endif
1147 
1148   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1149   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1150                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1151                           upTriFactor->csrMat->values->data().get(),
1152                           upTriFactor->csrMat->row_offsets->data().get(),
1153                           upTriFactor->csrMat->column_indices->data().get(),
1154                           upTriFactorT->csrMat->values->data().get(),
1155                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1156                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1157                           CUSPARSE_ACTION_NUMERIC, indexBase,
1158                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1159                         #else
1160                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1161                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1162                         #endif
1163 
1164   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1165   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1166 
1167   /* Create the solve analysis information */
1168   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1169   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1170   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1171   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1172                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1173                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1174                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1175                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1176   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1177   #endif
1178 
1179   /* perform the solve analysis */
1180   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1181                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1182                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1183                            upTriFactorT->csrMat->column_indices->data().get(),
1184                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1185                            upTriFactorT->solveInfo,
1186                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1187                           #else
1188                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1189                           #endif
1190 
1191   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1193 
1194   /* assign the pointer */
1195   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196   PetscFunctionReturn(0);
1197 }
1198 
/* Unary functor for thrust::transform: converts a PetscScalar into a PetscInt
   by taking (and truncating) its real part. Callable on host and device; used
   to read integer permutation indices back out of a scalar-valued array. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar val)
  {
    return static_cast<PetscInt>(PetscRealPart(val));
  }
};
1207 
/*
   Builds, or updates in place, an explicit transpose of A on the GPU
   (cusparsestruct->matTranspose) so transpose products can run as a regular
   SpMV over transposed data. No-op unless A->form_explicit_transpose is set
   and both dimensions are nonzero; returns early when the cached transpose
   is already current (A->transupdated). Collective-free, sequential matrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  /* make sure the (non-transposed) device copy of A is current first */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* for non-CSR storage, discard any previously cached transpose and rebuild below */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    /* the transpose inherits the index base (0 or 1) of the original matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* transpose stored as CSR with row/col dimensions swapped; same nnz */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload the host row offsets a->i; needed as csr2csc input below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* pre-CUDA-11 HYB/ELL path: transpose via HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* First time through: build csr2csc_i, the permutation taking CSR value
         positions to CSC (transpose) positions. We run csr2csc once on the
         sequence 0,1,2,... stored as scalars, then convert the permuted
         scalars back to integers; subsequent updates only need the cheap
         thrust gather at the bottom of this function. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 uses cusparseCsr2cscEx2, which needs an explicit work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* read the permuted sequence back as integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather: matrixT->values[k] = matrix->values[csr2csc_i[k]] */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1416 
1417 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  Solves the transposed system using the transposed triangular factors
  (generated lazily on first call). Order of operations: apply the row
  permutation (rpermIndices) to b, solve with the transposed upper factor,
  then the transposed lower factor, then apply the column permutation
  (cpermIndices) through the work vector since that gather cannot be in-place.
  bb is the right-hand side, xx receives the solution.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: result goes into the work vector tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: tempGPU in, xarray out */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1505 
/*
  Same as MatSolveTranspose_SeqAIJCUSPARSE but for a factorization in natural
  ordering: no row/column permutations are applied, so b is fed straight to
  the transposed upper-factor solve and the lower-factor solve writes x.
  The transposed factors are generated lazily on first call.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: barray in, work vector tempGPU out */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: tempGPU in, xarray out */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1575 
/*
  Forward/backward substitution with the cached lower/upper triangular
  factors, honoring the factorization's row and column permutations:
  gather b through rpermIndices into the work vector, solve L then U,
  and scatter through cpermIndices into x. bb is the right-hand side,
  xx receives the solution.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU in, xarray out */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: xarray in, tempGPU out */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1651 
/*
  Same as MatSolve_SeqAIJCUSPARSE but for a factorization in natural
  ordering: no permutations are needed, so b goes straight into the
  lower-factor solve and the upper-factor solve writes x directly.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: barray in, work vector tempGPU out */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: tempGPU in, xarray out */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1713 
/*
   Copies the numerical values of a GPU-resident matrix back to the host CSR
   arrays so the host copy becomes valid again; only the values move (the host
   sparsity pattern is assumed current). On success the offload mask is
   PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    /* For ELL/HYB storage cusp->mat->mat is a cusparseHybMat_t, not a CsrMatrix*,
       so the cast below would read garbage: raise an error instead */
    if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1734 
/*
   Hands back the host pointer to the matrix values, syncing them from the
   device first when the device copy is newer. Because the caller receives
   write access, the device copy is marked stale (PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU; /* caller may modify the values on the host */
  *array = aij->a;
  PetscFunctionReturn(0);
}
1746 
/*
   MatSeqAIJCUSPARSECopyToGPU - pushes the host CSR data of A to the GPU.

   Fast path: when the nonzero pattern is unchanged since the last copy and the
   storage format is CSR, only the numerical values are re-uploaded. Otherwise
   the device structures are destroyed and rebuilt from the host arrays
   (honoring compressed row storage when in use), in either CSR or, for
   CUDA < 11, ELL/HYB form.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* cleared when host values are absent so we do not claim OFFLOAD_BOTH */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed but structure did not: the cached transpose keeps its structure and only needs new values */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern changed (or non-CSR format): tear down and rebuild the device structures */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed row storage only the nonempty rows are represented on the device */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: build the structure only and remember host/device values do not match */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident copies of the scalars 1 and 0, needed below with CUSPARSE_POINTER_MODE_DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* first stage the data in a temporary device CSR matrix, then convert it to HYB/ELL */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1903 
/* Thrust functor for zip iterators: accumulates element 0 of the tuple into element 1 (y[i] = x[i] + y[i]) */
struct VecCUDAPlusEquals
{
  template <typename Tup>
  __host__ __device__ void operator()(Tup t) { thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t); }
};
1913 
/* Thrust functor for zip iterators: copies element 0 of the tuple into element 1 (y[i] = x[i]) */
struct VecCUDAEquals
{
  template <typename Tup>
  __host__ __device__ void operator()(Tup t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1923 
/* Thrust functor for zip iterators: copies element 1 of the tuple into element 0 (x[i] = y[i]) */
struct VecCUDAEqualsReverse
{
  template <typename Tup>
  __host__ __device__ void operator()(Tup t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
1933 
/* Scratch data attached to a Mat_Product for AIJCUSPARSE sparse-dense and
   sparse-sparse products; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense; /* caller's C was MATSEQDENSE (host); numeric phase converts C back after the GPU product */
  PetscScalar           *Bt;      /* device buffer for an explicit B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;        /* dense intermediate holding the sparse product for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count of the product, logged by the numeric phase */
  CsrMatrix             *Bcsr;    /* CSR form of B used when B is in compressed row storage (see spgemm numeric phase) */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr; /* dense descriptor for C or for the intermediate X */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4; /* extra SpGEMMreuse workspaces, freed in the destructor */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* size of mmBuffer, grown on demand by the numeric phase */
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1958 
/*
   Destructor for MatMatCusparse product data: releases the device B^T buffer,
   the helper CSR copy of B, all cusparse descriptors and SpGEMM workspaces,
   and the intermediate dense matrix, then frees the structure itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4) {
    cerr = cudaFree(mm->dBuffer4);CHKERRCUDA(cerr);
  }
  if (mm->dBuffer5) {
    cerr = cudaFree(mm->dBuffer5);CHKERRCUDA(cerr);
  }
 #endif
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1987 
1988 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1989 
/*
   Numeric phase of C = op(A)*op(B) with A SEQAIJCUSPARSE and B dense: runs
   cusparseSpMM (CUDA >= 11) or csrmm plus an explicit cublas transpose of B
   (older CUDA). For PtAP/RARt the sparse product is written into the
   intermediate mmdata->X and combined with B afterwards via
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the dimensions m x n of the sparse product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* sparse part (A*P) lands in mmdata->X; P^t applied below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the cached explicit transpose of A with a non-transposed op */
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* sparse part (A*R^t) lands in mmdata->X; R applied below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse product into the intermediate X, otherwise directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size increased */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated buffer mmdata->Bt */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense product combining X with B */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert back to the types the user originally passed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2162 
/*
   Symbolic phase of C = op(A)*op(B) with A SEQAIJCUSPARSE and B dense:
   determines the result sizes, makes C a SEQDENSECUDA matrix, allocates the
   MatMatCusparse scratch data (B^T buffer for pre-CUDA-11, intermediate X for
   PtAP/RARt), and installs the numeric-phase routine.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  if      (product->type == MATPRODUCT_AB)   {m = A->rmap->n; n = B->cmap->n;}
  else if (product->type == MATPRODUCT_AtB)  {m = A->cmap->n; n = B->cmap->n;}
  else if (product->type == MATPRODUCT_ABt)  {m = A->rmap->n; n = B->rmap->n;}
  else if (product->type == MATPRODUCT_PtAP) {m = B->cmap->n; n = B->cmap->n;}
  else if (product->type == MATPRODUCT_RARt) {m = B->rmap->n; n = B->rmap->n;}
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* remember whether C came in as a host dense matrix so the numeric phase can convert it back */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* allocate the product scratch data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm cannot transpose B: stage a device buffer for an explicit B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* PtAP and RARt go through an intermediate dense product X; do not preallocate it,
     since the first call to MatDenseCUDAGetArray will allocate on the GPU for us */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2236 
2237 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2238 {
2239   Mat_Product                  *product = C->product;
2240   Mat                          A,B;
2241   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2242   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2243   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2244   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2245   PetscBool                    flg;
2246   PetscErrorCode               ierr;
2247   cusparseStatus_t             stat;
2248   cudaError_t                  cerr;
2249   MatProductType               ptype;
2250   MatMatCusparse               *mmdata;
2251 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2252   cusparseSpMatDescr_t         BmatSpDescr;
2253 #endif
2254   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2255 
2256   PetscFunctionBegin;
2257   MatCheckProduct(C,1);
2258   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2259   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2260   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2261   mmdata = (MatMatCusparse*)C->product->data;
2262   A = product->A;
2263   B = product->B;
2264   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2265     mmdata->reusesym = PETSC_FALSE;
2266     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2267     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2268     Cmat = Ccusp->mat;
2269     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2270     Ccsr = (CsrMatrix*)Cmat->mat;
2271     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2272     goto finalize;
2273   }
2274   if (!c->nz) goto finalize;
2275   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2276   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2277   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2278   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2279   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2280   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2281   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2282   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2283   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2284   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2285   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2286   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2287   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2288   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2289 
2290   ptype = product->type;
2291   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2292   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2293   switch (ptype) {
2294   case MATPRODUCT_AB:
2295     Amat = Acusp->mat;
2296     Bmat = Bcusp->mat;
2297     break;
2298   case MATPRODUCT_AtB:
2299     Amat = Acusp->matTranspose;
2300     Bmat = Bcusp->mat;
2301     break;
2302   case MATPRODUCT_ABt:
2303     Amat = Acusp->mat;
2304     Bmat = Bcusp->matTranspose;
2305     break;
2306   default:
2307     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2308   }
2309   Cmat = Ccusp->mat;
2310   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2311   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2312   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2313   Acsr = (CsrMatrix*)Amat->mat;
2314   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2315   Ccsr = (CsrMatrix*)Cmat->mat;
2316   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2317   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2318   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2319   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2320 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2321   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2322   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2323   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2324     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2325                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2326                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2327                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2328   #else
2329     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2330                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2331                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2332                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2333     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2334                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2335                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2336   #endif
2337 #else
2338   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2339                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2340                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2341                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2342                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2343 #endif
2344   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2345   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2346   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2347   C->offloadmask = PETSC_OFFLOAD_GPU;
2348 finalize:
2349   /* shorter version of MatAssemblyEnd_SeqAIJ */
2350   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2351   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2352   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2353   c->reallocs         = 0;
2354   C->info.mallocs    += 0;
2355   C->info.nz_unneeded = 0;
2356   C->assembled = C->was_assembled = PETSC_TRUE;
2357   C->num_ass++;
2358   PetscFunctionReturn(0);
2359 }
2360 
/*
   Symbolic phase of the sparse matrix-matrix products C = A*B, C = A^t*B and C = A*B^t
   for MATSEQAIJCUSPARSE matrices, computed on the GPU with cuSPARSE SpGEMM.

   Determines the nonzero pattern of C on the device, allocates C's CSR storage there,
   mirrors the pattern to the host (a shortened MatSeqAIJ preallocation), and stores the
   cuSPARSE descriptors/work buffers in the Mat_Product data (MatMatCusparse) so the
   numeric phase can reuse them. Three code paths are compiled depending on the CUDA
   version: cusparseSpGEMMreuse (>= 11.4), cusparseSpGEMM (11.0 - 11.3), and the legacy
   csrgemm interface (< 11.0).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE in CSR storage format */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data: freed by MatDestroy_MatMatCusparse when the product is destroyed */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* symmetric operands let us avoid forming an explicit transpose */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick the multiply structures (possibly explicit transposes) and the sizes of C,
     and record which operands use compressed-row storage */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits A's nonzero row set: copy the row indices and set up the GPU mirrors */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants (cuSPARSE is used in device pointer mode below) */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* short-circuit: C is empty, so skip SpGEMM entirely and go straight to host finalization */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CsrMatrix view of B that aliases B's values/columns but uses
       full (uncompressed) row offsets, uploading them to the GPU once and caching them */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops per (a_ij, b_jk) pair: sum row lengths of B over A's column indices */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* create C's generic descriptor with zero nnz for now; the real pointers are set
     with cusparseCsrSetPointers once the pattern size is known */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* two-phase call: first query the three buffer sizes, then compute the nnz pattern */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy the pattern into C; dBuffer5 is kept alive in mmdata for later _compute calls */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* run the numeric product once here so the symbolic phase leaves valid values in C */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif
#else
  /* legacy (pre-CUDA 11) interface: csrgemmNnz for the pattern, then a full gemm */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror C's pattern on the host so the MatSeqAIJ side of the matrix is usable */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not present in the compressed set get the running offset (zero-length rows) */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics, as MatSeqAIJSetPreallocation would produce */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2753 
2754 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2755 
2756 /* handles sparse or dense B */
/*
   Chooses the symbolic product routine for a MATSEQAIJCUSPARSE operand, handling
   both sparse (SEQAIJCUSPARSE) and dense (SEQDENSE) B. Per product type, a
   -*_backend_cpu option lets the user force the CPU (plain SeqAIJ) implementation;
   matrices bound to the CPU also fall back to the SeqAIJ path.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* the GPU sparse-sparse path is only eligible when no operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* each product type exposes two option names: one for the legacy MatXxxMult API
       (api_user) and one for the MatProduct interface */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no native GPU kernel for these; compose from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2878 
/* yy = A*xx, computed on the GPU */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no vector to add (NULL), no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2887 
/* zz = A*xx + yy, computed on the GPU */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2896 
/* yy = A^H * xx, computed on the GPU */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE and herm = PETSC_TRUE select op = ^H in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2905 
/* zz = A^H * xx + yy, computed on the GPU */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE and herm = PETSC_TRUE select op = ^H in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2914 
/* yy = A^T * xx, computed on the GPU */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE with herm = PETSC_FALSE selects op = ^T in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2923 
/* y[idx[i]] += x[i] for 0 <= i < n.

   Used to add the (short) compressed-row work vector back into the full-length result
   vector; idx maps compressed positions to full-vector positions. Launched with a 1D
   grid of 1D blocks covering at least n threads (see MatMultAddKernel_SeqAIJCUSPARSE).
   idx entries must be distinct, which the compressed-row index list guarantees, so no
   atomics are needed. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* widen before multiplying: blockIdx.x*blockDim.x can overflow a 32-bit int for very
     large n, which would produce a negative index that passes the signed bound check */
  size_t i = (size_t)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < (size_t)n) y[idx[i]] += x[i];
}
2929 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   Shared kernel behind all MatMult*/MatMult*Add operations of SEQAIJCUSPARSE.
   - yy may be NULL (plain multiply) and yy == zz is allowed (in-place multiply-add).
   - "Compressed" means the GPU CSR matrix drops rows that are entirely zero
     (matstruct->cprowIndices lists the kept rows); in that case the scratch
     cusparsestruct->workVector bridges between the short and the full-length vectors. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x, read off the GPU matrix */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* no nonzero rows: op(A) x == 0, so z is just y (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* transpose on the fly: hand the non-transposed GPU matrix to cuSPARSE with op = ^T or ^H */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly stored A^T (built on first need) and multiply it without transposing */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation descriptor cache below, so guard against enum changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose case: dptr was the full-length zz, so only the optional add of yy remains */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* multiply-add costs 2 flops per nonzero; without the add, one flop per nonzero row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3121 
/* zz = A^T * xx + yy, computed on the GPU */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE with herm = PETSC_FALSE selects op = ^T in the kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3130 
/* Completes assembly on the host, then drops the cached device-side matrix if the
   nonzero structure changed during assembly (the cached copy would be stale). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   oldnzstate = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && A->nonzerostate != oldnzstate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3148 
/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide, and we can call the SeqAIJ preallocation directly */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3208 
/* Releases the GPU-side data hanging off spptr, removes the methods composed onto the
   object at conversion time, and finishes with the generic SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       i;
  /* names composed in MatConvert_SeqAIJ_SeqAIJCUSPARSE()/MatBindToCPU_SeqAIJCUSPARSE() */
  const char     *composed[] = {"MatSeqAIJCopySubArray_C",
                                "MatCUSPARSESetFormat_C",
                                "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                "MatFactorGetSolverType_C",
                                "MatSetPreallocationCOO_C",
                                "MatSetValuesCOO_C",
                                "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* factored matrices carry triangular-factor data in spptr, plain matrices a Mat_SeqAIJCUSPARSE */
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  for (i=0; i<(PetscInt)(sizeof(composed)/sizeof(composed[0])); i++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[i],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3231 
3232 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3233 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* duplicate as a host SeqAIJ first, then convert in place so *B gets the CUSPARSE type and GPU structures */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3243 
/* Y = Y + a*X on the GPU. Three paths:
   - SAME_NONZERO_PATTERN: one cublasXaxpy() on the CSR value arrays;
   - SUBSET_NONZERO_PATTERN: cuSPARSE csrgeam with the output aliased onto Y's arrays;
   - otherwise: fall back to the CPU MatAXPY_SeqAIJ() (which may change Y's pattern). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* one of the two matrices is bound to the CPU: do the update on the host and drop Y's cached transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: equal nonzero counts and identical index arrays mean the patterns match */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in the geam: Y = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host here, so switch the pointer mode around the geam calls */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* the output uses Y's own descriptor and arrays: valid because X's pattern is a subset of Y's */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry by entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* patterns differ (or DIFFERENT forced above): update on the host, which may reallocate Y */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3341 
/* Y = a*Y: scales the CSR value array in place on the GPU with a single cublas scal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  cublasStatus_t berr;
  PetscBLASInt   inc = 1, nz = 1;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&nz);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(handle,nz,&a,vals,inc);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(nz);CHKERRQ(ierr); /* one multiply per stored nonzero */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  /* scaling invalidates any cached diagonal */
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3363 
/* Zeroes the matrix values on both the device (when present) and the host, keeping the
   nonzero structure. The offload mask records whether the two copies still agree. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      zeroedondevice = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        zeroedondevice = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    /* keep the cached transpose (if any) consistent with the zeroed matrix */
    if (cusp->matTranspose) {
      CsrMatrix *csrt = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrt->values) {
        thrust::fill(thrust::device,csrt->values->begin(),csrt->values->end(),0.);
      }
    }
  }
  /* zero the host copy too (does what MatZeroEntries_SeqAIJ would, without extra bookkeeping) */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = zeroedondevice ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3394 
/* Switches A's function table between the CPU (SeqAIJ) and the GPU (SeqAIJCUSPARSE)
   implementations. flg = PETSC_TRUE binds to the CPU: the data is first downloaded from
   the GPU, CPU kernels are installed and GPU-only composed methods are removed;
   flg = PETSC_FALSE (re)installs the GPU kernels and composed methods.
   Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before the CPU kernels take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inodes are only exploited by the CPU kernels */
  PetscFunctionReturn(0);
}
3445 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a MATSEQAIJ matrix (or a factor matrix) to MATSEQAIJCUSPARSE

   Input Parameters:
+  A     - the source matrix
.  mtype - the target type (MATSEQAIJCUSPARSE)
-  reuse - MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX, or MAT_INPLACE_MATRIX

   Output Parameter:
.  newmat - the converted matrix (equal to A for MAT_INPLACE_MATRIX)

   Notes:
   Allocates the GPU-side Mat_SeqAIJCUSPARSE (or, for factors, Mat_SeqAIJCUSPARSETriFactors)
   struct, creates the cuSPARSE handle on PETSc's default CUDA stream, and installs the
   CUSPARSE-specific operations table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE).
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* plain matrix: allocate the SpMV/SpMM data holder */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factor matrix: allocate the triangular-factor data holder instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  /* install the GPU-aware operations; MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE) fills in the rest */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
3506 
/* Type constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3516 
3517 /*MC
3518    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3519 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3521    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3522    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3523 
3524    Options Database Keys:
3525 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3526 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3527 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3528 
3529   Level: beginner
3530 
3531 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3532 M*/
3533 
3534 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3535 
/*
   MatSolverTypeRegister_CUSPARSE - registers the cuSPARSE-based factorization back ends.

   MATSOLVERCUSPARSEBAND provides LU for MATSEQAIJ matrices (banded solver);
   MATSOLVERCUSPARSE provides LU/Cholesky/ILU/ICC for MATSEQAIJCUSPARSE matrices.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
3549 
/* Free all GPU-side storage held by a Mat_SeqAIJCUSPARSE struct (matrix, cached transpose,
   work/permutation arrays, cuSPARSE handle) and free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting a NULL pointer is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3569 
/* Release the three device containers of a CsrMatrix (values, column indices, row offsets),
   delete the struct, and zero the caller's pointer. Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3582 
/* Free a single triangular-factor struct: cuSPARSE descriptors, analysis info,
   the CSR matrix, and the device/host scratch buffers used by the solves. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* csr2csc conversion scratch exists only with the CUDA-11 generic API */
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3602 
/* Free a Mat_SeqAIJCUSPARSEMultStruct (the CSR or legacy HYB matrix data, cuSPARSE
   descriptors, device-resident scalar constants, and — for CUDA >= 11 — the per-op
   SpMV buffers/vector descriptors) and set *matstruct to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code; it was previously silently dropped */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by SpMV/SpMM calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3646 
/* Reset (free) all factor data held by a Mat_SeqAIJCUSPARSETriFactors without
   destroying the container or its cuSPARSE handle, so it can be refactored. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* lower/upper factors plus their transposed variants used by MatSolveTranspose */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* storage used by the banded LU factorization path */
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3669 
/* Fully destroy a Mat_SeqAIJCUSPARSETriFactors: reset all factor data, destroy
   the cuSPARSE handle, and free the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* separate assignment from test: `if (handle = ...)` reads like a typo for == and
       draws -Wparentheses; behavior is unchanged */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3686 
/* Strict-weak ordering on (i,j) index pairs: lexicographic by row, then column.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b)
  {
    const PetscInt ra = a.get<0>(), rb = b.get<0>();
    return (ra < rb) || (ra == rb && a.get<1>() < b.get<1>());
  }
};
3697 
/* Equality of (i,j) index pairs; used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b)
  {
    return a.get<0>() == b.get<0>() && a.get<1>() == b.get<1>();
  }
};
3707 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &a, const PetscInt &b)
  {
    return (a != b) ? 1 : 0;
  }
};
3716 
/* Logical OR of two flags, returned as a 0/1 PetscInt. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &a, const PetscInt &b)
  {
    return (a || b) ? 1 : 0;
  }
};
3725 
3726 #include <thrust/iterator/discard_iterator.h>
/*
   MatSetValuesCOO_SeqAIJCUSPARSE - set/add values for a matrix preallocated with
   MatSetPreallocationCOO_SeqAIJCUSPARSE, directly into the GPU CSR storage.

   Input Parameters:
+  A     - the matrix
.  v     - the values, in the same order as the (coo_i,coo_j) pairs given at preallocation;
           may be a host or device pointer; NULL with INSERT_VALUES zeroes the matrix
-  imode - INSERT_VALUES or ADD_VALUES

   The permutation cusp->cooPerm (built at preallocation) maps each v[] entry to its
   CSR slot; cusp->cooPerm_a, when present, groups repeated (i,j) entries for reduction.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them together first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3808 
/*
   MatSeqAIJCUSPARSEInvalidateTranspose - mark the cached GPU transpose as stale.

   Input Parameters:
+  A       - the MATSEQAIJCUSPARSE matrix
-  destroy - if PETSC_TRUE, also free the cached transpose structure and the
             csr2csc permutation; if PETSC_FALSE, only clear the up-to-date flag
             (used when just the numerical values changed)
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3825 
3826 #include <thrust/binary_search.h>
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - preallocate CSR storage from COO (i,j) pairs.

   Input Parameters:
+  A     - the matrix
.  n     - number of COO entries (duplicates allowed; they will be summed on MatSetValuesCOO)
.  coo_i - row indices (host array of length n)
-  coo_j - column indices (host array of length n)

   Builds on the GPU: cusp->cooPerm (maps each original COO entry to its sorted position)
   and, if duplicates exist, cusp->cooPerm_a (for each entry, the index of its unique CSR
   slot). The CSR pattern (a->i, a->j) is then copied back to the host.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* entry count changed: previous permutation arrays are unusable */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* replace the host CSR pattern with the one just computed on the device */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3944 
3945 /*@C
3946     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
3947 
3948    Not collective
3949 
3950     Input Parameters:
3951 +   A - the matrix
3952 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
3953 
3954     Output Parameters:
3955 +   ia - the CSR row pointers
3956 -   ja - the CSR column indices
3957 
3958     Level: developer
3959 
3960     Notes:
3961       When compressed is true, the CSR structure does not contain empty rows
3962 
3963 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
3964 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): this early return requires BOTH i and j to be non-NULL, which makes the
     later if (i)/if (j) guards always true; possibly (!i && !j) was intended — confirm
     before changing, callers may rely on the current contract */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* lazily build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
3993 
3994 /*@C
3995     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
3996 
3997    Not collective
3998 
3999     Input Parameters:
4000 +   A - the matrix
4001 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4002 
4003     Output Parameters:
4004 +   ia - the CSR row pointers
4005 -   ja - the CSR column indices
4006 
4007     Level: developer
4008 
4009 .seealso: MatSeqAIJCUSPARSEGetIJ()
4010 @*/
/* Counterpart of MatSeqAIJCUSPARSEGetIJ(): invalidates the caller's borrowed
   device pointers by setting them to NULL. No GPU work is performed. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) { *i = NULL; }
  if (j) { *j = NULL; }
  PetscFunctionReturn(0);
}
4020 
4021 /*@C
4022    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4023 
4024    Not Collective
4025 
4026    Input Parameter:
4027 .   A - a MATSEQAIJCUSPARSE matrix
4028 
4029    Output Parameter:
4030 .   a - pointer to the device data
4031 
4032    Level: developer
4033 
4034    Notes: may trigger host-device copies if up-to-date matrix data is on host
4035 
4036 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4037 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* read access: ensure the device copy is current before handing out the pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* raw device pointer to the CSR values */
  PetscFunctionReturn(0);
}
4056 
4057 /*@C
4058    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4059 
4060    Not Collective
4061 
4062    Input Parameter:
4063 .   A - a MATSEQAIJCUSPARSE matrix
4064 
4065    Output Parameter:
4066 .   a - pointer to the device data
4067 
4068    Level: developer
4069 
4070 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4071 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state change needed, just invalidate the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4081 
4082 /*@C
4083    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4084 
4085    Not Collective
4086 
4087    Input Parameter:
4088 .   A - a MATSEQAIJCUSPARSE matrix
4089 
4090    Output Parameter:
4091 .   a - pointer to the device data
4092 
4093    Level: developer
4094 
4095    Notes: may trigger host-device copies if up-to-date matrix data is on host
4096 
4097 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4098 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* read-write access: ensure the device copy is current first */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* caller may modify the values: the GPU copy becomes the authoritative one,
     and any cached transpose values are now stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4119 /*@C
4120    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4121 
4122    Not Collective
4123 
4124    Input Parameter:
4125 .   A - a MATSEQAIJCUSPARSE matrix
4126 
4127    Output Parameter:
4128 .   a - pointer to the device data
4129 
4130    Level: developer
4131 
4132 .seealso: MatSeqAIJCUSPARSEGetArray()
4133 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been modified while checked out: bump the object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4146 
4147 /*@C
4148    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4149 
4150    Not Collective
4151 
4152    Input Parameter:
4153 .   A - a MATSEQAIJCUSPARSE matrix
4154 
4155    Output Parameter:
4156 .   a - pointer to the device data
4157 
4158    Level: developer
4159 
4160    Notes: does not trigger host-device copies and flags data validity on the GPU
4161 
4162 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4163 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: no host-to-device copy needed, the caller will overwrite */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the GPU copy becomes the authoritative one; cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4183 
4184 /*@C
4185    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4186 
4187    Not Collective
4188 
   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data obtained with MatSeqAIJCUSPARSEGetArrayWrite()
4194 
4195    Level: developer
4196 
4197 .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4198 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* device values were (presumably) overwritten through the borrowed pointer:
     bump the object state and drop the user's reference; nothing is copied */
  *a   = NULL;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4211 
/* Orders (row,col,val,perm) tuples lexicographically by (row, col); used as the
   comparator when merging the COO forms of two matrices */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;      /* distinct rows: row index decides */
    return t1.get<1>() < t2.get<1>();  /* same row: column index decides */
  }
};
4222 
/* Unary functor adding a fixed offset to an index; used to shift the column
   indices (or row offsets) of the second matrix when concatenating */
struct Shift
{
  int _shift; /* constant offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
4234 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation

   With MAT_INITIAL_MATRIX, the result C = [A B] is built entirely on the GPU:
   both inputs are converted to COO, merged with a single thrust::merge keyed on
   (row,col), converted back to CSR, and the merge permutation is stored in
   Ccusp->cooPerm so that MAT_REUSE_MATRIX can refill C's values without
   recomputing the symbolic pattern. The explicit transpose of C is assembled
   too when both inputs carry one. The host (CPU) copy of C's pattern is also
   populated so C is a fully valid SeqAIJ matrix. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and its GPU support structures by hand (bypasses the usual
       host-side preallocation path) */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* numeric + symbolic merge: convert both inputs to COO, merge by
         (row,col), record where each entry of A (perm flag 1) and B (flag 0)
         landed so MAT_REUSE_MATRIX can scatter values directly */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split merged positions into destinations of A's entries (flag 1) and
         B's entries (flag 0); equivalent to the partition_copy above */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked by rows: row offsets of B' are shifted by
           nnz(A) and appended after A's (overlapping at the shared boundary) */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* B' offsets overwrite the duplicated last offset of A' */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR pattern on the host so C is a fully valid SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: sparsity pattern unchanged, scatter the new values of
       A and B into C via the permutation recorded at first assembly */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4538 
/* Copies selected entries (by position idx[] into the aij value array, or the
   leading n entries when idx is NULL) of A's device values into v[]. The
   destination v may live on the host or on the device; isCudaMem decides.
   Fix: the final log call recorded the transfer as CpuToGpu although the copy
   performed when v is host memory is device->host; it must be GpuToCpu. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* gather av[idx[i]] on the device through a permutation iterator */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* device staging buffer for a host destination */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* data went device->host */
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4578