xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision dfd57a172ac9fa6c7b5fe6de6ab5df85cefc2996)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
/* Option-string tables consumed by PetscOptionsEnum(): the value names come first
   (in 0-based enum-value order), followed by the enum type name, the common prefix,
   and a null terminator. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  /* cuSPARSE has no enum with value 0 for csr2csc; "INVALID" is a placeholder so positions line up */
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
/* ---------- forward declarations of routines defined later in this file ---------- */

/* symbolic/numeric factorization entry points installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* triangular solves (plain and natural-ordering fast paths) and core Mat operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* destruction helpers for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* host<->device data movement */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* COO assembly interface (used by the MPI class as well) */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/* Attach a CUDA stream to the cuSPARSE handle held in A->spptr so that all
   subsequent cuSPARSE work for this matrix is issued on that stream.
   Errors with PETSC_ERR_COR if the matrix has no cuSPARSE structure yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *spstruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!spstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  spstruct->stream = stream;
  cstat = cusparseSetStream(spstruct->handle,spstruct->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
103 
/* Install a (possibly externally owned) cuSPARSE handle on A, destroying the
   previously held handle if it differs, and put the handle into device
   pointer mode. Errors with PETSC_ERR_COR if A has no cuSPARSE structure. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *spstruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!spstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (spstruct->handle != handle) {
    /* release the old handle before taking ownership of the new one */
    if (spstruct->handle) {cstat = cusparseDestroy(spstruct->handle);CHKERRCUSPARSE(cstat);}
    spstruct->handle = handle;
  }
  cstat = cusparseSetPointerMode(spstruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
120 
/* Drop A's reference to its cuSPARSE handle WITHOUT destroying it (the caller
   retains ownership). Silently does nothing when A is not a MATSEQAIJCUSPARSE
   matrix or has no cuSPARSE structure. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *spstruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse || !spstruct) PetscFunctionReturn(0);
  if (spstruct->handle) spstruct->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* MatFactorGetSolverType callback: reports that factor matrices created by
   this file belong to the MATSOLVERCUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
153 
/* Factory routine registered for MATSOLVERCUSPARSE: creates the empty factor
   matrix B for an LU/ILU/ILUDT or Cholesky/ICC factorization of A and installs
   the symbolic-factorization function pointers. The symbolic/numeric phases
   run later through the ops set here. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;  /* factors are square, sized by A's row map */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* factortype must be set before MatSetType so type construction sees it */
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* preferred orderings: nested dissection for full LU, natural ordering for incomplete factorizations */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factor matrices get their storage during factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
184 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
   A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL act identically; any other operation is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *spstruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations set the one stored format */
  case MAT_CUSPARSE_ALL:
    spstruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
202 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; silently a no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229 
/* MatSetOption implementation: handles MAT_FORM_EXPLICIT_TRANSPOSE locally
   (so stale GPU transpose data can be discarded when the option is turned off)
   and forwards every other option to the CPU SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* destroy any existing transpose now, so no stale data survives if flg becomes true again later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
247 
248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249 
/* Numeric LU factorization: the factorization itself runs on the CPU via the
   SeqAIJ kernel; afterwards the appropriate triangular-solve variants are
   selected (a faster path exists for natural row/column ordering) and the
   triangular factors are copied to the GPU with their solve analysis. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before the CPU factorization */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no matrix right-hand-side solves are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280 
/* Processes -mat_cusparse_* command-line options. Only unfactored matrices
   (MAT_FACTOR_NONE) expose the storage-format options; with CUDA >= 11 the
   cuSPARSE SpMV/SpMM/csr2csc algorithm selections are exposed as well. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations (SpMV and triangular solve) */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check as above, for the SpMM table */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check, for the csr2csc table */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
316 
/* ILU(k) symbolic factorization: clear any stale GPU triangular-factor data,
   run the CPU symbolic phase, then install the CUSPARSE numeric routine. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
328 
/* Full LU symbolic factorization: clear any stale GPU triangular-factor data,
   run the CPU symbolic phase, then install the CUSPARSE numeric routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
340 
/* ICC(k) symbolic factorization: clear any stale GPU triangular-factor data,
   run the CPU symbolic phase, then install the CUSPARSE numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
352 
/* Cholesky symbolic factorization: clear any stale GPU triangular-factor data,
   run the CPU symbolic phase, then install the CUSPARSE numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
364 
/* Builds (first call) or refreshes (later calls) the unit-diagonal lower
   triangular factor L of an ILU factorization as a CSR matrix on the GPU,
   including the cuSPARSE triangular-solve analysis. The host matrix A holds
   the combined factor data in Mat_SeqAIJ format; the strictly-lower part of
   each row is copied and a 1.0 diagonal entry is appended. On update calls
   only the numerical values are re-uploaded; the sparsity stays fixed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] counts the strictly-lower entries of rows 1..n-1 (row 0 has none); n adds the unit diagonal */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned (page-locked) host staging buffers for the transfer to the GPU */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i ... */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          /* ... then append the unit diagonal entry */
          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: device-side CSR arrays populated from the pinned host buffers */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style interface needs an explicitly sized external work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer for later numeric-only updates; index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix: same traversal as above, values only */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
511 
512 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
513 {
514   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
515   PetscInt                          n = A->rmap->n;
516   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
517   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
518   cusparseStatus_t                  stat;
519   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
520   const MatScalar                   *aa = a->a,*v;
521   PetscInt                          *AiUp, *AjUp;
522   PetscInt                          i,nz, nzUpper, offset;
523   PetscErrorCode                    ierr;
524   cudaError_t                       cerr;
525 
526   PetscFunctionBegin;
527   if (!n) PetscFunctionReturn(0);
528   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
529     try {
530       /* next, figure out the number of nonzeros in the upper triangular matrix. */
531       nzUpper = adiag[0]-adiag[n];
532       if (!upTriFactor) {
533         PetscScalar *AAUp;
534 
535         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
536 
537         /* Allocate Space for the upper triangular matrix */
538         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
539         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
540 
541         /* Fill the upper triangular matrix */
542         AiUp[0]=(PetscInt) 0;
543         AiUp[n]=nzUpper;
544         offset = nzUpper;
545         for (i=n-1; i>=0; i--) {
546           v  = aa + adiag[i+1] + 1;
547           vi = aj + adiag[i+1] + 1;
548 
549           /* number of elements NOT on the diagonal */
550           nz = adiag[i] - adiag[i+1]-1;
551 
552           /* decrement the offset */
553           offset -= (nz+1);
554 
555           /* first, set the diagonal elements */
556           AjUp[offset] = (PetscInt) i;
557           AAUp[offset] = (MatScalar)1./v[nz];
558           AiUp[i]      = AiUp[i+1] - (nz+1);
559 
560           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
561           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
562         }
563 
564         /* allocate space for the triangular factor information */
565         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
566         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
567 
568         /* Create the matrix description */
569         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
570         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
571        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
572         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
573        #else
574         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
575        #endif
576         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
577         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
578 
579         /* set the operation */
580         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
581 
582         /* set the matrix */
583         upTriFactor->csrMat = new CsrMatrix;
584         upTriFactor->csrMat->num_rows = n;
585         upTriFactor->csrMat->num_cols = n;
586         upTriFactor->csrMat->num_entries = nzUpper;
587 
588         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
589         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
590 
591         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
592         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
593 
594         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
595         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
596 
597         /* Create the solve analysis information */
598         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
599         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
600       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
601         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
602                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
603                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
604                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
605                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
606         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
607       #endif
608 
609         /* perform the solve analysis */
610         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
611                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
612                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
613                                  upTriFactor->csrMat->column_indices->data().get(),
614                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
615                                  upTriFactor->solveInfo,
616                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
617                                #else
618                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
619                                #endif
620         cerr = WaitForCUDA();CHKERRCUDA(cerr);
621         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
622 
623         /* assign the pointer */
624         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
625         upTriFactor->AA_h = AAUp;
626         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
627         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
628         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
629       } else {
630         if (!upTriFactor->AA_h) {
631           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
632         }
633         /* Fill the upper triangular matrix */
634         offset = nzUpper;
635         for (i=n-1; i>=0; i--) {
636           v  = aa + adiag[i+1] + 1;
637 
638           /* number of elements NOT on the diagonal */
639           nz = adiag[i] - adiag[i+1]-1;
640 
641           /* decrement the offset */
642           offset -= (nz+1);
643 
644           /* first, set the diagonal elements */
645           upTriFactor->AA_h[offset] = 1./v[nz];
646           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
647         }
648         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
649         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
650       }
651     } catch(char *ex) {
652       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
653     }
654   }
655   PetscFunctionReturn(0);
656 }
657 
/* Build both ILU triangular factors on the GPU and cache the row/column
   permutations there as well (only when they are not the identity), so that
   MatSolve_SeqAIJCUSPARSE can run entirely on the device. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row,colperm = aij->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used by the two-stage triangular solve */
  if (!factors->workVector) { factors->workVector = new THRUSTARRAY(m); }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the row permutation (applied before the lower solve) once */
  ierr = ISIdentity(rowperm,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(idx,idx+m);
    ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upload the column permutation (applied after the upper solve) once */
  ierr = ISIdentity(colperm,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(idx,idx+m);
    ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
701 
/* Build the GPU triangular factors for an ICC (incomplete Cholesky) factorization.

   The host-side factor stores only the upper triangle (SBAIJ layout with the
   diagonal last in each row: aa[ai[i]..ai[i+1]-1], diagonal at v[nz]).  From it
   we build two CSR matrices on the GPU sharing the same sparsity pattern AiUp/AjUp:
     - upTriFactor: U with unit diagonal flagged (values hold 1/d on the diagonal
       and the negated off-diagonal entries),
     - loTriFactor: values additionally scaled by 1/d, solved with
       CUSPARSE_OPERATION_TRANSPOSE so it acts as the lower factor.
   On the first call (no factors yet) the pattern and values are built and the
   cuSPARSE solve analysis is performed; on refactorization only the numerical
   values are recomputed and re-uploaded. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;   /* host staging buffer for the upper-factor values */
  PetscScalar                       *AALo;   /* host staging buffer for the lower-factor values */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* the factor matrix data is actually stored in SBAIJ (symmetric) format */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* nothing to do unless the CPU copy is newer than (or the only copy of) the data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers for fast async-capable uploads */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first factorization: build pattern + values and run the solve analysis */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];   /* diagonal stored inverted */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate off-diagonals; the lower factor is additionally scaled by 1/d */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* unit diagonal: the stored 1/d diagonal entries are skipped by the solve */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* upper-triangular storage solved with OPERATION_TRANSPOSE acts as the lower factor */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same AiUp/AjUp pattern as the upper factor, AALo values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* refactorization: same pattern, recompute values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
921 
/* Build the ICC triangular factors on the GPU and, for a non-identity
   ordering, cache both the permutation and its inverse on the device so the
   triangular solves can permute in place. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    permIdentity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) { factors->workVector = new THRUSTARRAY(m); }
  /* strictly-triangular entries appear in both factors, the diagonal once */
  factors->nnz = (aij->nz-m)*2 + m;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload permutation and its inverse when the ordering is non-trivial */
  ierr = ISIdentity(perm,&permIdentity);CHKERRQ(ierr);
  if (!permIdentity) {
    IS             invPerm;
    const PetscInt *inv,*fwd;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invPerm);CHKERRQ(ierr);
    ierr = ISGetIndices(invPerm,&inv);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&fwd);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(fwd, fwd+m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(inv, inv+m);
    ierr = ISRestoreIndices(invPerm,&inv);CHKERRQ(ierr);
    ierr = ISDestroy(&invPerm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&fwd);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
959 
/* Numeric Cholesky factorization for SEQAIJCUSPARSE: factor on the CPU via
   the SeqAIJ kernel, then select the GPU solve routines and push the
   triangular factors to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bAIJ = (Mat_SeqAIJ*)B->data;
  IS             perm  = bAIJ->row;
  PetscBool      permIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* ensure the CPU copy is current before factoring on the host */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural ordering allows the permutation-free solve kernels */
  ierr = ISIdentity(perm,&permIdentity);CHKERRQ(ierr);
  if (permIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
989 
/* Build the transposed (CSC) versions of both triangular factors and run the
   cuSPARSE solve analysis on them, so that MatSolveTranspose can run on the
   GPU.  The transposes are stored in ->loTriFactorPtrTranspose and
   ->upTriFactorPtrTranspose; the fill mode is flipped because the CSC of an
   upper-triangular CSR matrix is lower triangular (and vice versa).

   Bug fix: the MAT_CUSPARSEGenerateTranspose event was "ended" with
   PetscLogEventBegin in two places, leaving the event unbalanced and
   corrupting -log_view timing; both now call PetscLogEventEnd. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the descriptor of the lower factor, flipping the fill mode since
     transposition swaps upper and lower triangles */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 requires an explicit workspace for csr2csc */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event must be ENDED here to balance the log */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the descriptor of the upper factor, again flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event must be ENDED here to balance the log */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1195 
/* Unary functor that truncates the real part of a PetscScalar to a PetscInt.
   Used below to convert the scalar sequence produced by csr2csc into the
   integer gather permutation cached in cusparsestruct->csr2csc_i. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar v)
  {
    return (PetscInt)PetscRealPart(v);
  }
};
1204 
/*
   Build (or refresh) an explicit transpose of A on the GPU so that transpose
   multiplications can run as a plain SpMV on the transposed data.

   Flow:
     - cheap exits: explicit transpose not requested, empty row/col map, or the
       transpose is already up to date (A->transupdated);
     - first call: allocate the Mat_SeqAIJCUSPARSEMultStruct for the transpose
       (descriptor, alpha/beta device constants, CSR/HYB storage);
     - CSR path: run csr2csc once on a 0,1,2,... sequence to recover the
       CSR->CSC permutation (cached in csr2csc_i); subsequent updates are then a
       single thrust gather of the current values;
     - ELL/HYB path (CUDA < 11 only): HYB -> CSR -> csr2csc -> HYB round trip.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR storage cannot reuse the cached csr2csc permutation; rebuild from scratch */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the transposed CSR arrays; values are filled later by csr2csc */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* the descriptor records the (still unfilled) device pointers; no data is read here */
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT holds the transpose, so one would expect num_rows = A->cmap->n
         and row_offsets sized A->cmap->n+1; as written this looks wrong for rectangular
         matrices — confirm (square matrices are unaffected) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* transpose a 0,1,2,... sequence: the resulting "values" of the CSC matrix
         are the CSR positions each CSC slot came from, i.e. the gather permutation */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* cache the permutation as integers for the thrust gather below */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* refresh the transposed values by gathering through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1413 
1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solve A^T x = b using transposed triangular factors (permuted ordering).
   Since A's factorization is L*U, the transpose solve applies U^T first and
   L^T second, with the row permutation applied to b up front and the column
   permutation applied to the result at the end.
   The transposed factor structures are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, result goes into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U^T (input xarray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L^T (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1502 
/*
   Solve A^T x = b with transposed triangular factors, natural ordering:
   no row/column permutations are applied, so b feeds U^T directly and the
   L^T solve writes straight into x. Transposed factors are built lazily.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T (input barray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L^T (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1572 
/*
   Solve A x = b using the cached L and U triangular factors (permuted ordering):
   apply the row permutation to b, forward-solve L, back-solve U, then apply the
   column permutation to produce x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, result goes into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U (input xarray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1648 
/*
   Solve A x = b with the cached L and U factors, natural ordering: no
   permutations, so b feeds the L forward-solve directly and the U back-solve
   writes straight into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L (input barray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1710 
/* Mirror the matrix values from the GPU back into the host CSR arrays.

   Acts only when the up-to-date copy lives exclusively on the GPU
   (PETSC_OFFLOAD_GPU). Only the numerical values are moved; the host
   nonzero pattern is assumed valid already. Afterwards both copies
   agree, so the offload mask is raised to PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  {
    CsrMatrix    *csr   = (CsrMatrix*)cusp->mat->mat;
    const size_t bytes  = aij->nz*sizeof(PetscScalar);

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), bytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(bytes);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now hold the same values */
  }
  PetscFunctionReturn(0);
}
1731 
/* Hand out the host pointer to the matrix values for direct access.

   The values are first synchronized from the GPU if needed; because the
   caller may modify them through the returned pointer, the device copy
   must be considered stale, hence the mask drops to PETSC_OFFLOAD_CPU. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* caller may write through the pointer */
  PetscFunctionReturn(0);
}
1743 
/* Upload the host (CPU) representation of a SeqAIJ matrix to the GPU.

   Fast path: if the nonzero pattern is unchanged (same nonzerostate) and the
   storage format is plain CSR, only the numerical values are re-uploaded and
   any cached explicit transpose is invalidated (values only).

   Slow path: otherwise the entire device representation (mult struct, work
   vector, cached row offsets) is destroyed and rebuilt from the host CSR —
   or compressed-row CSR — arrays; ELL/HYB formats are produced through
   cusparse_csr2hyb on pre-CUDA-11 toolkits only.

   On success the offload mask becomes PETSC_OFFLOAD_BOTH, except for
   pattern-only matrices (no host values array). Fails if A is bound to CPU. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* new values invalidate any cached explicit transpose; its pattern stays valid */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* pattern (or storage format) changed: drop all device-side structures and rebuild */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage describes only the nonempty rows (ridx maps back) */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* a->a may legitimately be NULL (pattern-only matrix); then host/device values
           cannot be declared in sync at the end */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident copies of 1 and 0 are required because the handle is
           switched to CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage a temporary device CSR copy only to feed cusparse_csr2hyb */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed once the HYB matrix exists */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* accounts for row offsets + column indices (int), cprow indices, and 3 scalar constants + values */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1900 
/* Thrust functor for zipped (src,dst) tuples: accumulate the source
   element (slot 0) into the destination element (slot 1), i.e. dst += src. */
struct VecCUDAPlusEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT zipped)
  {
    thrust::get<1>(zipped) += thrust::get<0>(zipped);
  }
};
1910 
/* Thrust functor for zipped (src,dst) tuples: copy the source element
   (slot 0) into the destination element (slot 1), i.e. dst = src. */
struct VecCUDAEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
1920 
/* Thrust functor for zipped tuples with the roles swapped relative to
   VecCUDAEquals: copy slot 1 into slot 0. */
struct VecCUDAEqualsReverse
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
1930 
/* State carried by a MatProduct with a SeqAIJCUSPARSE operand between the
   symbolic and numeric phases (stored in C->product->data and released by
   MatDestroy_MatMatCusparse). */
struct MatMatCusparse {
  PetscBool             cisdense;  /* was C MATSEQDENSE (CPU) at symbolic time? if so convert back after the GPU product */
  PetscScalar           *Bt;       /* device buffer for explicit B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;         /* dense intermediate A*B (PtAP) or A*B^T (RARt) */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count to log at each numeric phase */
  CsrMatrix             *Bcsr;    /* uncompressed copy of B when B uses compressed-row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* dense descriptor for B (SpMM path) */
  cusparseDnMatDescr_t  matCDescr;  /* dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4; /* extra SpGEMMreuse work buffers (CUDA >= 11.4) */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* allocated size of mmBuffer */
  void                  *mmBuffer;    /* SpMM/SpGEMM compute buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* persistent SpGEMM descriptor */
#endif
};
1955 
/* Destructor for the MatMatCusparse product data attached to C->product.
   Releases every GPU buffer and cuSPARSE descriptor that the symbolic and
   numeric product phases may have created, then the container itself.
   All frees are guarded (or rely on cudaFree(NULL) being a no-op), so a
   partially-populated structure is handled safely. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cuerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cstat;
 #endif

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr); /* cudaFree(NULL) is a no-op */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) {
    cstat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->matBDescr) {
    cstat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->matCDescr) {
    cstat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->spgemmDesc) {
    cstat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(cstat);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4) { cuerr = cudaFree(mm->dBuffer4);CHKERRCUDA(cuerr); }
  if (mm->dBuffer5) { cuerr = cudaFree(mm->dBuffer5);CHKERRCUDA(cuerr); }
 #endif
  if (mm->mmBuffer)  { cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr); }
  if (mm->mmBuffer2) { cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1984 
1985 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1986 
/* Numeric phase of C = op(A) op(B) for sparse (SeqAIJCUSPARSE) A times
   dense (SeqDENSECUDA) B, covering AB, AtB, ABt, PtAP and RARt.

   For PtAP/RARt the sparse product lands in the intermediate mmdata->X and
   the final dense-dense product with B (resp. B^T) is applied afterwards.
   On CUDA >= 11 this uses the generic cusparseSpMM API with cached
   descriptors and a reusable work buffer; on older toolkits it falls back
   to csrmm, forming an explicit B^T via cublasXgeam when op(B) is a
   transpose, since csrmm cannot transpose B. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the output dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cusparse transpose on the fly, or use the cached explicit transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse product into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the work buffer when cusparse asks for more */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt (allocated at symbolic time) */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense product against B (or B^T) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the original (CPU) types the caller handed us */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2159 
/* Symbolic phase of sparse (SeqAIJCUSPARSE) times dense (SeqDENSECUDA):
   sets the sizes and type of C, allocates the MatMatCusparse product data
   (including the pre-CUDA-11 B^T buffer and the dense intermediate X for
   PtAP/RARt), and installs the matching numeric callback. Only the plain
   CSR cuSPARSE storage format is supported. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the resulting dense matrix C for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2233 
/* Numeric phase of sparse-sparse C = op(A) op(B) with both operands (and C)
   of type SeqAIJCUSPARSE.

   If mmdata->reusesym is set, the values were already computed during the
   symbolic phase (api_user path) and only the assembly bookkeeping runs.
   Otherwise the product is evaluated on the GPU: via cusparseSpGEMMreuse
   (CUDA >= 11.4), cusparseSpGEMM (CUDA >= 11.0), or csrgemm on older
   toolkits. Transposed operands are handled through the cached explicit
   transposes selected below, since cuSPARSE spgemm itself cannot transpose. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just assemble */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* for symmetric operands the transpose product reduces to the plain one */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  /* pick the (explicit-transpose) mult structs so no cusparse-side transposition is needed */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* pattern was fixed at symbolic time; SpGEMMreuse recomputes values only */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2357 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse-sparse
   products C = A*B, C = A^T*B or C = A*B^T, with A, B and C all MATSEQAIJCUSPARSE in CSR format.

   Computes the nonzero pattern of C on the GPU (cusparseSpGEMMreuse on CUDA >= 11.4,
   cusparseSpGEMM on CUDA >= 11, csrgemm on older toolkits), allocates the CSR structure of C
   on both device and host, and stashes the cuSPARSE descriptors and work buffers in the
   product data (MatMatCusparse) so the numeric phase can reuse them.

   Since the cuSPARSE spgemm routines only accept non-transposed operands, the transposed
   product types are handled by forming an explicit transpose of A (AtB) or B (ABt).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE and stored as plain CSR (no ELL/HYB) */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data, freed by MatDestroy_MatMatCusparse when the product is destroyed */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for symmetric operands the explicit transpose can be skipped: A^T*B == A*B, A*B^T == A*B */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* select operand mult structs and the m x n (with inner dim k) sizes of the product */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    /* C inherits A's nonzero-row structure; row indices are duplicated on host and device */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: build an all-zero row_offsets array and skip the spgemm entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets; column
       indices and values are shared with B's compressed storage */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count, stored in mmdata for logging in the numeric phase too */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops per (a_ij, b_jk) pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A^T*B pairs the nonzeros of row i of A with those of row i of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts with nnz = 0 and NULL arrays; the spgemm fills in the sizes */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* two-pass nnz computation: first query buffer sizes, then run with the buffers */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy phase fills C's column indices; dBuffer5 must outlive it for reuse_compute */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* numeric values are computed here as well, so the numeric phase can be a pure recompute */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  /* legacy (pre-CUDA 11) path: separate Nnz and gemm calls */
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror C's CSR structure (i,j) on the host, as MatAssembly/MatSetValues expect */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not listed in rindex are empty: repeat the previous offset for them */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row counts and row-length statistics, as MatAssemblyEnd_SeqAIJ would compute */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2750 
2751 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2752 
/* handles sparse or dense B */
/*
   MatProductSetFromOptions_SeqAIJCUSPARSE - selects the product-symbolic backend for a
   MATSEQAIJCUSPARSE A, depending on the type of B (and C for ABC products):

   - dense B: GPU kernels (MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA), unless A is
     bound to the CPU, in which case the SeqAIJ*SeqDense path is used;
   - cusparse B (and C): GPU spgemm for AB/AtB/ABt, generic composition for PtAP/RARt/ABC;
   - anything else (or the user-requested CPU backend): plain SeqAIJ dispatch.

   Command line options of the form -mat*_backend_cpu / -matproduct_*_backend_cpu force
   the CPU backend even when all operands live on the GPU.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* operands bound to the CPU disqualify the GPU sparse-sparse backend */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* each product type has two option names: the MatXXX api_user spelling and the MatProduct one */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
        ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* composed out of the binary GPU products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2875 
/* yy = A*xx on the GPU; thin wrapper over the shared SpMV kernel (no transpose, no add) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2884 
/* zz = A*xx + yy on the GPU; thin wrapper over the shared SpMV kernel (no transpose, with add) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2893 
/* yy = A^H*xx on the GPU; thin wrapper over the shared SpMV kernel (trans = herm = true) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2902 
/* zz = A^H*xx + yy on the GPU; thin wrapper over the shared SpMV kernel (trans = herm = true, with add) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2911 
/* yy = A^T*xx on the GPU; thin wrapper over the shared SpMV kernel (transpose, not Hermitian) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2920 
/* y[idx[i]] += x[i] for 0 <= i < n.

   Scatter-adds a compressed work vector x back into the full-length result y; launched by
   MatMultAddKernel_SeqAIJCUSPARSE() with one thread per entry on PetscDefaultCudaStream.
   No atomics are used, so the idx entries are assumed pairwise distinct (each compressed
   row maps to a unique full row) -- NOTE(review): holds for cprowIndices; confirm for any
   new caller. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* Compute the global index in PetscInt: the product blockIdx.x*blockDim.x is unsigned
     32-bit arithmetic and would silently wrap for very large n (PetscInt may be 64-bit),
     so promote before multiplying. */
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2926 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared implementation behind the MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose(Add)
   wrappers above. yy may be NULL (no add) and yy == zz is allowed (in-place add). Handles matrices
   stored with compressed rows (zero rows dropped) via a scatter/gather through workVector. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* No nonzero rows at all: the product is just y (or zero when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* Apply the (conjugate) transpose on the fly via the cuSPARSE operation flag */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* Use an explicitly stored transpose (built on demand) and multiply with no op */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        /* First call with this op: create the dense-vector descriptors and the SpMV work buffer,
           then cache them in cuSpMV[opA] for reuse on subsequent calls */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      /* Pre-CUDA-11 legacy csrmv path */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (multiply + add) per stored nonzero; without an add we save one op per nonzero row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3118 
/* zz = A^T*xx + yy on the GPU; thin wrapper over the shared SpMV kernel (transpose, with add) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
3127 
/* Finish assembly via the SeqAIJ implementation; if assembly changed the nonzero structure,
   any cached device-side matrix is stale and must be released. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp    = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   oldstate = A->nonzerostate; /* snapshot before host-side assembly runs */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && oldstate != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3145 
3146 /* --------------------------------------------------------------------------------*/
3147 /*@
3148    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3150    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3151    assembly performance the user should preallocate the matrix storage by setting
3152    the parameter nz (or the array nnz).  By setting these parameters accurately,
3153    performance during matrix assembly can be increased by more than a factor of 50.
3154 
3155    Collective
3156 
3157    Input Parameters:
3158 +  comm - MPI communicator, set to PETSC_COMM_SELF
3159 .  m - number of rows
3160 .  n - number of columns
3161 .  nz - number of nonzeros per row (same for all rows)
3162 -  nnz - array containing the number of nonzeros in the various rows
3163          (possibly different for each row) or NULL
3164 
3165    Output Parameter:
3166 .  A - the matrix
3167 
3168    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3170    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3171 
3172    Notes:
3173    If nnz is given then nz is ignored
3174 
3175    The AIJ format (also called the Yale sparse matrix format or
3176    compressed row storage), is fully compatible with standard Fortran 77
3177    storage.  That is, the stored row and column indices can begin at
3178    either one (as in Fortran) or zero.  See the users' manual for details.
3179 
3180    Specify the preallocated storage with either nz or nnz (not both).
3181    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3182    allocation.  For large problems you MUST preallocate memory or you
3183    will get TERRIBLE performance, see the users' manual chapter on matrices.
3184 
3185    By default, this format uses inodes (identical nodes) when possible, to
3186    improve numerical efficiency of matrix-vector products and solves. We
3187    search for consecutive rows with the same nonzero structure, thereby
3188    reusing matrix information to achieve increased efficiency.
3189 
3190    Level: intermediate
3191 
3192 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3193 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* Create, size, and type the matrix, then preallocate with the SeqAIJ kernel */
  err = MatCreate(comm,A);CHKERRQ(err);
  err = MatSetSizes(*A,m,n,m,n);CHKERRQ(err);
  err = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(err);
  err = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3205 
/* Destroy the matrix: release GPU-side storage, detach composed methods, then run the SeqAIJ destructor */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* spptr holds either the cuSPARSE matrix context or, for factored matrices, the triangular factors */
  if (A->factortype == MAT_FACTOR_NONE) {
    err = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(err);
  } else {
    err = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(err);
  }
  /* Clear every method this type composed in MatConvert/MatBindToCPU */
  err = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(err);
  err = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(err);
  err = MatDestroy_SeqAIJ(A);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3227 
3228 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3229 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying as SeqAIJ, then converting the copy in place back to SeqAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(err);
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3239 
/* Y = a*X + Y on the GPU.

   Dispatches on the relationship between the nonzero patterns:
   - SAME_NONZERO_PATTERN   -> single cuBLAS axpy on the value arrays
   - SUBSET_NONZERO_PATTERN -> cuSPARSE csrgeam (general sparse add into Y's pattern)
   - otherwise              -> host fallback MatAXPY_SeqAIJ()
   If either matrix is bound to the CPU (detected via differing axpy ops), the host path is used. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* One of the two matrices is bound to the CPU: fall back to the host kernel and
       invalidate Y's cached transpose since its values are about to change */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* Same nz count: compare row offsets and column indices on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* X's pattern is a subset of Y's: C = a*X + 1*Y written into Y's existing structure */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* alpha/beta (a, b) live on the host here, so switch pointer mode around the spgeam call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* Identical patterns: the value arrays line up entry-for-entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* Different patterns: let the host implementation rebuild Y's structure */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3337 
/* Y = a*Y: scale all stored nonzeros in place with a single cuBLAS scal on the device value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   ione = 1, nz = 1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&nz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,nz,&a,vals,ione);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(nz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3359 
/* Zero all stored values, on both device (when present) and host, so the offload mask can stay BOTH */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij       = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuzeroed  = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* Zero the device CSR values (and those of the cached transpose, if it exists) with thrust */
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        gpuzeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* Zero the host values directly (equivalent to what MatZeroEntries_SeqAIJ would do) */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* Both copies are zero only if the device copy was actually present and zeroed */
  A->offloadmask = gpuzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3390 
/* Switch matrix A between host (flg == PETSC_TRUE) and device implementations of its operations.
   The two branches below are parallel dispatch tables and must be kept in sync: every op/composed
   method set in one branch is set (possibly to NULL) in the other. Factored matrices are untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* Binding to CPU: make the host copy current first, then install the SeqAIJ kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* Remove GPU-only composed methods (they are reinstated when unbinding below) */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* Unbinding: install the CUSPARSE kernels and compose the GPU-specific methods */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg;  /* NOTE(review): inode use is enabled only when bound to CPU -- presumably only the host kernels exploit it; confirm */
  PetscFunctionReturn(0);
}
3441 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.

   Supports MAT_INITIAL_MATRIX (duplicate then convert), MAT_REUSE_MATRIX (copy values into
   *newmat), and MAT_INPLACE_MATRIX (convert A itself; reuse is neither of the first two).
   Creates the cuSPARSE context (spptr) if not present, installs the GPU op table via
   MatBindToCPU(...,PETSC_FALSE), and retypes the object. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* Vectors created from this matrix default to the CUDA vector type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* Unfactored matrix: create the cuSPARSE context with CSR storage and default algorithms */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* Factored matrix: only the triangular-factors context is needed */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* Install the GPU implementations of the remaining Mat operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3495 
/* Type constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix,
   then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3505 
3506 /*MC
3507    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3508 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3511    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3512 
3513    Options Database Keys:
3514 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3515 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3516 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3517 
3518   Level: beginner
3519 
3520 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3521 M*/
3522 
3523 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3524 
/* Register the CUSPARSE-based solver packages with PETSc's solver-type registry:
   the banded LU solver (operating on plain SeqAIJ) and the generic cusparse
   solver for LU/Cholesky/ILU/ICC on SeqAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftype[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* banded LU factorization works directly on the SeqAIJ type */
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  /* one GetFactor routine serves all four factorization kinds */
  for (size_t k = 0; k < sizeof(ftype)/sizeof(ftype[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftype[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3538 
/* Free everything held by a Mat_SeqAIJCUSPARSE container (matrix and transpose
   structs, thrust work arrays, cusparse handle) and the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cs = *cusparsestruct;

  PetscFunctionBegin;
  if (cs) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cs->mat,cs->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cs->matTranspose,cs->format);CHKERRQ(ierr);
    /* C++ delete of a NULL pointer is a no-op, so no guards are needed */
    delete cs->workVector;
    delete cs->rowoffsets_gpu;
    delete cs->cooPerm;
    delete cs->cooPerm_a;
    delete cs->csr2csc_i;
    if (cs->handle) {stat = cusparseDestroy(cs->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3558 
/* Release the three device arrays of a CsrMatrix plus the struct, then NULL the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3571 
/* Free one triangular-factor struct: its cusparse matrix descriptor and analysis
   info, the CSR storage, and the device/host scratch buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  cudaError_t      cerr;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

    if (tf->descr)     {stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);}
    if (tf->solveInfo) {stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);}
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    if (tf->solveBuffer) {cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);}
    if (tf->AA_h)        {cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (tf->csr2cscBuffer) {cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3591 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR, or HYB/ELL on
   CUDA < 11), the cusparse matrix descriptor, the device scalar constants, and
   (CUDA >= 11) the cached SpMV descriptors/buffers.
   Fix: the CsrMatrix_Destroy() return code was previously discarded; it is now
   checked with CHKERRQ like every other call in this file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* was previously unchecked */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident copies of 1.0 and 0.0 used as SpMV coefficients */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3635 
/* Release the factorization data held in a Mat_SeqAIJCUSPARSETriFactors container,
   leaving the container (and its cusparse handle) intact for reuse.
   Fix: a_band_d/i_band_d are now set to NULL after cudaFree; previously a second
   Reset would call cudaFree on the stale pointers (double free), unlike the other
   members which are either NULLed here or freed by PetscFree (which NULLs them). */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* MatSeqAIJCUSPARSEMultStruct_Destroy NULLs each pointer via PetscFree */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->a_band_d = NULL; /* prevent double cudaFree on a subsequent Reset */
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3658 
/* Destroy a Mat_SeqAIJCUSPARSETriFactors container: reset its contents, destroy
   the cusparse handle, and free the container.
   Fix: the assignment-inside-condition `if (handle = ...)` is replaced by an
   explicit assignment followed by a test, avoiding -Wparentheses warnings and
   accidental-assignment ambiguity. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3675 
/* Strict weak ordering on (row,col) tuples: lexicographic, row first then column.
   Used by thrust::sort_by_key to order COO entries row-major. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3686 
/* Equality on (row,col) tuples; used by thrust::unique to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3696 
/* Binary op returning 0 when the operands are equal, 1 otherwise; used with
   adjacent_difference to flag index changes in a sorted sequence. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3705 
/* Logical OR producing 0/1 as a PetscInt; combines the row-change and
   column-change flags computed with IJDiff. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3714 
3715 #include <thrust/iterator/discard_iterator.h>
/* Insert or add values into a matrix preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE().
   v[] (host or device pointer) holds the values in the original (unsorted) COO ordering;
   cusp->cooPerm maps CSR slots back to positions in v[], and cusp->cooPerm_a (when non-NULL)
   groups repeated (i,j) entries so they can be summed.  v == NULL with INSERT_VALUES zeros
   the matrix.  Finishes with an abbreviated MatAssemblyEnd. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* temporary device copy of v[] when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO information available: just finish a regular assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage the host values on the device */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[]; sum them before adding to the matrix */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); /* values[i] += cooPerm_w[i] */
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3797 
/* Mark the cached explicit transpose as stale.  With destroy == PETSC_TRUE the
   transpose struct and the csr2csc index map are freed as well; otherwise they
   are kept so they can be refreshed in place. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0); /* nothing cached yet */
  A->transupdated = PETSC_FALSE;
  if (destroy) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  PetscFunctionReturn(0);
}
3814 
3815 #include <thrust/binary_search.h>
/* Preallocate the matrix from n COO entries (coo_i[], coo_j[]) given on the host.
   The (i,j) pairs are sorted row-major on the GPU with thrust, duplicates are
   detected, and cooPerm (plus cooPerm_a when duplicates exist) is stored so that
   MatSetValuesCOO_SeqAIJCUSPARSE() can later scatter/reduce values from the
   original COO ordering into CSR slots.  The CPU-side SeqAIJ structure is
   rebuilt and then copied to the GPU. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* existing arrays can only be reused when the size matches */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);          /* device copies of the COO row/col indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);  /* per-row upper-bound positions; becomes a->i+1 */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted row indices */
    THRUSTINTARRAY w = d_j; /* copy the sorted column indices */

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the CPU-side CSR arrays of the base SeqAIJ class */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* fill per-row lengths and track the longest row */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3933 
/* Borrow a read-only device pointer to the CSR values array (CSR format only).
   Triggers a host-to-device copy if the GPU data is stale.  Pair with
   MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure device data is current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3952 
/* Return a pointer obtained with MatSeqAIJCUSPARSEGetArrayRead(); only NULLs the
   caller's pointer since read access changes no state. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}
3962 
/* Borrow a writable device pointer to the CSR values (CSR format only).  The GPU
   copy becomes the authoritative one and the cached transpose is marked stale.
   Pair with MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure device data is current before handing it out */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3983 
/* Return a pointer obtained with MatSeqAIJCUSPARSEGetArray(); bumps the object
   state (values may have changed) and NULLs the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
3996 
/* Borrow a write-only device pointer to the CSR values (CSR format only).
   Unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is performed,
   since the current contents will be overwritten.  The cached transpose is
   marked stale.  Pair with MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4016 
/* Return a pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite(); bumps the
   object state (values were rewritten) and NULLs the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
4029 
/* Strict weak ordering on (row,col,value,index) 4-tuples: lexicographic on the
   first two components only (value and index tag along with the sort). */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
4040 
/* Unary functor adding a fixed offset to an index (e.g. column shift when
   concatenating matrices side by side). */
struct Shift
{
  int _shift; /* offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
4052 
4053 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4054 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4055 {
4056   PetscErrorCode               ierr;
4057   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4058   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4059   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4060   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4061   PetscInt                     Annz,Bnnz;
4062   cusparseStatus_t             stat;
4063   PetscInt                     i,m,n,zero = 0;
4064   cudaError_t                  cerr;
4065 
4066   PetscFunctionBegin;
4067   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4068   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4069   PetscValidPointer(C,4);
4070   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4071   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
4072   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
4073   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4074   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4075   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4076   if (reuse == MAT_INITIAL_MATRIX) {
4077     m     = A->rmap->n;
4078     n     = A->cmap->n + B->cmap->n;
4079     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
4080     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
4081     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4082     c     = (Mat_SeqAIJ*)(*C)->data;
4083     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4084     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4085     Ccsr  = new CsrMatrix;
4086     Cmat->cprowIndices      = NULL;
4087     c->compressedrow.use    = PETSC_FALSE;
4088     c->compressedrow.nrows  = 0;
4089     c->compressedrow.i      = NULL;
4090     c->compressedrow.rindex = NULL;
4091     Ccusp->workVector       = NULL;
4092     Ccusp->nrows    = m;
4093     Ccusp->mat      = Cmat;
4094     Ccusp->mat->mat = Ccsr;
4095     Ccsr->num_rows  = m;
4096     Ccsr->num_cols  = n;
4097     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4098     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4099     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4100     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4101     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4102     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4103     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4104     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4105     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4106     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4107     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4108     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
4109     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
4110     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4111     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4112 
4113     Acsr = (CsrMatrix*)Acusp->mat->mat;
4114     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4115     Annz = (PetscInt)Acsr->column_indices->size();
4116     Bnnz = (PetscInt)Bcsr->column_indices->size();
4117     c->nz = Annz + Bnnz;
4118     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4119     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4120     Ccsr->values = new THRUSTARRAY(c->nz);
4121     Ccsr->num_entries = c->nz;
4122     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4123     if (c->nz) {
4124       auto Acoo = new THRUSTINTARRAY32(Annz);
4125       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
4126       auto Ccoo = new THRUSTINTARRAY32(c->nz);
4127       THRUSTINTARRAY32 *Aroff,*Broff;
4128 
4129       if (a->compressedrow.use) { /* need full row offset */
4130         if (!Acusp->rowoffsets_gpu) {
4131           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4132           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4133           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4134         }
4135         Aroff = Acusp->rowoffsets_gpu;
4136       } else Aroff = Acsr->row_offsets;
4137       if (b->compressedrow.use) { /* need full row offset */
4138         if (!Bcusp->rowoffsets_gpu) {
4139           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4140           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4141           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4142         }
4143         Broff = Bcusp->rowoffsets_gpu;
4144       } else Broff = Bcsr->row_offsets;
4145       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4146       stat = cusparseXcsr2coo(Acusp->handle,
4147                               Aroff->data().get(),
4148                               Annz,
4149                               m,
4150                               Acoo->data().get(),
4151                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4152       stat = cusparseXcsr2coo(Bcusp->handle,
4153                               Broff->data().get(),
4154                               Bnnz,
4155                               m,
4156                               Bcoo->data().get(),
4157                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4158       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4159       auto Aperm = thrust::make_constant_iterator(1);
4160       auto Bperm = thrust::make_constant_iterator(0);
4161 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4162       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4163       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4164 #else
4165       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4166       auto Bcib = Bcsr->column_indices->begin();
4167       auto Bcie = Bcsr->column_indices->end();
4168       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4169 #endif
4170       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4171       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4172       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4173       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4174       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4175       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4176       auto p1 = Ccusp->cooPerm->begin();
4177       auto p2 = Ccusp->cooPerm->begin();
4178       thrust::advance(p2,Annz);
4179       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4180 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4181       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4182 #endif
4183       auto cci = thrust::make_counting_iterator(zero);
4184       auto cce = thrust::make_counting_iterator(c->nz);
4185 #if 0 //Errors on SUMMIT cuda 11.1.0
4186       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4187 #else
4188       auto pred = thrust::identity<int>();
4189       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4190       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4191 #endif
4192       stat = cusparseXcoo2csr(Ccusp->handle,
4193                               Ccoo->data().get(),
4194                               c->nz,
4195                               m,
4196                               Ccsr->row_offsets->data().get(),
4197                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4198       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4199       delete wPerm;
4200       delete Acoo;
4201       delete Bcoo;
4202       delete Ccoo;
4203 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4204       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4205                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4206                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4207                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4208 #endif
4209       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4210         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4211         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4212         CsrMatrix *CcsrT = new CsrMatrix;
4213         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4214         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4215 
4216         (*C)->form_explicit_transpose = PETSC_TRUE;
4217         (*C)->transupdated = PETSC_TRUE;
4218         Ccusp->rowoffsets_gpu = NULL;
4219         CmatT->cprowIndices = NULL;
4220         CmatT->mat = CcsrT;
4221         CcsrT->num_rows = n;
4222         CcsrT->num_cols = m;
4223         CcsrT->num_entries = c->nz;
4224 
4225         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4226         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4227         CcsrT->values = new THRUSTARRAY(c->nz);
4228 
4229         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4230         auto rT = CcsrT->row_offsets->begin();
4231         if (AT) {
4232           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4233           thrust::advance(rT,-1);
4234         }
4235         if (BT) {
4236           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4237           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4238           thrust::copy(titb,tite,rT);
4239         }
4240         auto cT = CcsrT->column_indices->begin();
4241         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4242         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4243         auto vT = CcsrT->values->begin();
4244         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4245         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4246         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4247 
4248         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4249         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4250         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4251         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4252         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4253         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4254         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4255         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4256         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4257 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4258         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4259                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4260                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4261                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4262 #endif
4263         Ccusp->matTranspose = CmatT;
4264       }
4265     }
4266 
4267     c->singlemalloc = PETSC_FALSE;
4268     c->free_a       = PETSC_TRUE;
4269     c->free_ij      = PETSC_TRUE;
4270     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4271     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4272     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4273       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4274       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4275       ii   = *Ccsr->row_offsets;
4276       jj   = *Ccsr->column_indices;
4277       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4278       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4279     } else {
4280       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4281       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4282     }
4283     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4284     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4285     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4286     c->maxnz = c->nz;
4287     c->nonzerorowcnt = 0;
4288     c->rmax = 0;
4289     for (i = 0; i < m; i++) {
4290       const PetscInt nn = c->i[i+1] - c->i[i];
4291       c->ilen[i] = c->imax[i] = nn;
4292       c->nonzerorowcnt += (PetscInt)!!nn;
4293       c->rmax = PetscMax(c->rmax,nn);
4294     }
4295     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4296     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4297     (*C)->nonzerostate++;
4298     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4299     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4300     Ccusp->nonzerostate = (*C)->nonzerostate;
4301     (*C)->preallocated  = PETSC_TRUE;
4302   } else {
4303     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4304     c = (Mat_SeqAIJ*)(*C)->data;
4305     if (c->nz) {
4306       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4307       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4308       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4309       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4310       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4311       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4312       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4313       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4314       Acsr = (CsrMatrix*)Acusp->mat->mat;
4315       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4316       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4317       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4318       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4319       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4320       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4321       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4322       auto pmid = Ccusp->cooPerm->begin();
4323       thrust::advance(pmid,Acsr->num_entries);
4324       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4325       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4326                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4327       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4328                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4329       thrust::for_each(zibait,zieait,VecCUDAEquals());
4330       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4331                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4332       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4333                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4334       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4335       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4336       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4337         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4338         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4339         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4340         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4341         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4342         auto vT = CcsrT->values->begin();
4343         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4344         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4345         (*C)->transupdated = PETSC_TRUE;
4346       }
4347       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4348     }
4349   }
4350   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4351   (*C)->assembled     = PETSC_TRUE;
4352   (*C)->was_assembled = PETSC_FALSE;
4353   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4354   PetscFunctionReturn(0);
4355 }
4356 
/*
  Gathers a sub-array of the matrix nonzero values into v.

  For k = 0..n-1, v[k] = a[idx[k]], where a is the device-side value array of
  the SeqAIJCUSPARSE matrix A. If idx is NULL, the first n values are copied
  contiguously instead. The destination v may reside in either host or device
  memory; isCudaMem() is used to detect which, and the copy/log calls are
  chosen accordingly.

  Input Parameters:
+ A   - the SeqAIJCUSPARSE matrix
. n   - number of entries to copy
- idx - indices into the nonzero value array (host memory), or NULL

  Output Parameter:
. v   - destination array (host or device memory), filled with the gathered values
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* if v is host memory, gather into a device scratch buffer first */
    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = dav[widx[k]] via a permutation iterator over the value array */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no indices: straight copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the final transfer above was device-to-host, so log GPU->CPU traffic
     (the original code incorrectly logged this as CPU->GPU) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4396