xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4d3610e326ff6ba11b121cb0a8ecfba2b4024273)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
74 
75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
81 
/*
   MatCUSPARSESetStream - records the given CUDA stream on the matrix and tells
   the matrix's cuSPARSE handle to issue all subsequent work on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (A->spptr must already hold the Mat_SeqAIJCUSPARSE struct)
-  stream - the CUDA stream cuSPARSE calls should be enqueued on
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  /* cache the stream so later reconfiguration can see what was chosen */
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
92 
/*
   MatCUSPARSESetHandle - installs a caller-provided cuSPARSE handle on the matrix.
   Any handle the matrix previously owned is destroyed first, and the (new) handle
   is switched to device pointer mode, i.e. scalar arguments of cuSPARSE calls are
   expected to live in device memory.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the old handle before adopting the caller's */
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
108 
/*
   MatCUSPARSEClearHandle - forgets (does NOT destroy) the cuSPARSE handle stored
   on the matrix, e.g. when the handle is owned by someone else and is about to
   be destroyed externally.

   Fix: guard against A->spptr being NULL. This routine can be called on a matrix
   whose GPU data structure was never created (or was already torn down), in which
   case the original code dereferenced a NULL pointer.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing to clear: GPU struct not set up */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
117 
/* Reports the solver package name (MATSOLVERCUSPARSE, i.e. "cusparse") for factors created by this package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
124 
125 /*MC
126   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
127   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
128   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
129   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
130   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
131   algorithms are not recommended. This class does NOT support direct solver operations.
132 
133   Level: beginner
134 
135 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
136 M*/
137 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an (empty) square factor matrix of
   type MATSEQAIJCUSPARSE for LU/ILU/ILUDT or Cholesky/ICC factorizations of A.
   Only the symbolic-factorization entry points are installed here; those routines
   in turn install the matching numeric-factorization routines.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n; /* factors are square, sized by the local row count of A */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE; /* the factorization honors the ordering passed to the symbolic phase */
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the symbolic factorization allocates the nonzero structure itself */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
163 
/*
   Type-specific implementation behind MatCUSPARSESetFormat(). For SEQAIJCUSPARSE
   there is a single storage-format field, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   are handled identically; any other operation is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
181 
182 /*@
183    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
184    operation. Only the MatMult operation can use different GPU storage formats
185    for MPIAIJCUSPARSE matrices.
186    Not Collective
187 
188    Input Parameters:
189 +  A - Matrix of type SEQAIJCUSPARSE
190 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
191 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
192 
193    Output Parameter:
194 
195    Level: intermediate
196 
197 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
198 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: dispatches to the composed "MatCUSPARSESetFormat_C" implementation
     if the matrix type provides one; silently a no-op for other matrix types */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
208 
209 /*@
210    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
211 
212    Collective on mat
213 
214    Input Parameters:
215 +  A - Matrix of type SEQAIJCUSPARSE
216 -  transgen - the boolean flag
217 
218    Level: intermediate
219 
220 .seealso: MATSEQAIJCUSPARSE
221 @*/
/*
   MatSeqAIJCUSPARSESetGenerateTranspose - toggles whether an explicit transpose of
   the matrix is built for MatMultTranspose. A no-op for matrices that are not of
   type SEQAIJCUSPARSE; errors on factored matrices.
*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    /* when turning the feature off, drop any cached transpose so a later
       re-enable cannot accidentally reuse a stale copy */
    if (!transgen) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
241 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* command-line
   options: explicit-transpose generation, GPU storage format, and (CUDA >= 11)
   the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) { /* no runtime options are offered for factored matrices */
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* -mat_cusparse_storage_format, when given, overrides the MULT-only setting above */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; /* default: deterministic variant (see comment block above) */
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
285 
/* Symbolic ILU: delegate to the CPU AIJ implementation, then point the numeric stage at the CUSPARSE version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
295 
/* Symbolic LU: delegate to the CPU AIJ implementation, then point the numeric stage at the CUSPARSE version */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
305 
/* Symbolic ICC: delegate to the CPU AIJ implementation, then point the numeric stage at the CUSPARSE Cholesky */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
315 
/* Symbolic Cholesky: delegate to the CPU AIJ implementation, then point the numeric stage at the CUSPARSE version */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
325 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds the GPU copy of the unit lower
   triangular factor L from the host ILU factorization stored in A's Mat_SeqAIJ
   data, and runs the cuSPARSE triangular-solve analysis on it.

   The strictly-lower entries of each row are copied from the host factor and a
   1.0 diagonal entry is appended explicitly (cuSPARSE needs a complete CSR
   matrix; the descriptor's UNIT diag type means those values are not read in
   the solves). NOTE(review): the indexing below relies on PETSc's factored-AIJ
   layout, where a->i/a->j address the L part row by row — confirm against
   MatILUFactorSymbolic_SeqAIJ if modifying.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; /* overwritten below with a fresh struct */
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;     /* host CSR row offsets / column indices for L */
  PetscScalar                       *AALo;            /* host CSR values for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* (re)build only when the authoritative copy is on the CPU or nothing is on the GPU yet */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers make the host-to-device copies below faster */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      /* row 0 of L holds only the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* ... then append the explicit unit diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 dropped the TRIANGULAR matrix type; GENERAL + fill mode is the replacement */
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: upload the host CSR arrays into device-side thrust vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 API requires an explicit user-allocated work buffer for the analysis/solve */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* host staging buffers are no longer needed once the device vectors own the data */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
446 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds the GPU copy of the upper
   triangular factor U from the host ILU factorization stored in A's Mat_SeqAIJ
   data, and runs the cuSPARSE triangular-solve analysis on it.

   Rows are filled bottom-up using a->diag, which in PETSc's factored-AIJ layout
   addresses the U part of each row. NOTE(review): the code takes 1./v[nz] for
   the diagonal — this presumes the host factor stores the inverse of the
   diagonal entry; confirm against MatLUFactorNumeric_SeqAIJ if modifying.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; /* overwritten below with a fresh struct */
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;     /* host CSR row offsets / column indices for U */
  PetscScalar                       *AAUp;            /* host CSR values for U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* (re)build only when the authoritative copy is on the CPU or nothing is on the GPU yet */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards through the U part, so adiag[0]-adiag[n] is its total length */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers make the host-to-device copies below faster */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz]; /* v[nz] is the stored (inverse) diagonal — see note above */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 dropped the TRIANGULAR matrix type; GENERAL + fill mode is the replacement */
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: upload the host CSR arrays into device-side thrust vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 API requires an explicit user-allocated work buffer for the analysis/solve */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                   upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                   upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                   upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                   &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* host staging buffers are no longer needed once the device vectors own the data */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
563 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - resets any existing GPU triangular
   factors, rebuilds both L and U on the GPU from the host ILU data, allocates
   the shared work vector, and caches the row/column permutations on the GPU
   (only when they are not identity permutations).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop previous GPU factor contents before rebuilding (Reset keeps the struct itself,
     since it is still dereferenced below) */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector of length n (used by the solve routines; not visible here) */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) { /* only upload the permutation when it actually permutes */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic generated by the permutation uploads above */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
611 
/*
   Builds the GPU copies of the ICC/Cholesky triangular factors from the host factor data in A,
   and runs the cusparse triangular-solve analysis on each of them.

   Both factors share a single upper-triangular CSR pattern (AiUp/AjUp): the upper solve uses the
   pattern directly (NON_TRANSPOSE, unit diagonal), while the lower solve reuses the same pattern
   with a TRANSPOSE operation (hence its fill mode is also set to UPPER) and separately scaled
   values (AALo). Work is skipped when the data already resides on the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* host (pinned) CSR row offsets / column indices of the upper factor */
  PetscScalar                       *AAUp;          /* host (pinned) values for the upper-triangular solve */
  PetscScalar                       *AALo;          /* host (pinned) values for the lower-triangular (transpose) solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (above, for nz) and to Mat_SeqSBAIJ (below,
     for i/j/a); presumably the ICC factor is stored in SBAIJ (upper-triangular) layout and the two
     structs agree on the fields used -- confirm against the factor-symbolic routine */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the (last) diagonal entry of row i; both factors store its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* copy the off-diagonal pattern, then negate for the upper solve and
             additionally scale by the diagonal reciprocal for the lower solve */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      /* thrust assign() copies the pinned host arrays to the GPU */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 requires an explicit workspace buffer for the csrsv2 analysis/solve */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      /* UPPER is intentional: the lower solve is done as a TRANSPOSE solve (see solveOp below)
         on the same upper-triangular pattern, with the non-unit scaled diagonal in AALo */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* the lower factor shares the upper factor's pattern (AiUp/AjUp), only the values differ */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* both host and device now hold current factor data; release the pinned staging arrays */
      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      /* NOTE(review): this only catches thrown char* messages; thrust/std exceptions
         (e.g. std::bad_alloc) would propagate past this handler -- confirm intent */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
791 
/*
   Rebuilds the GPU-side ICC triangular factor data for A and uploads the row permutation
   (and its inverse) when the ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  const PetscInt               *permidx;
  PetscBool                    identity;
  PetscInt                     n        = A->rmap->n;

  PetscFunctionBegin;
  /* discard any stale GPU factor data, then rebuild the ICC factors on the device */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(n);
  factors->nnz        = (aij->nz-n)*2 + n; /* strict lower + strict upper + diagonal */

  /* upload the permutation and its inverse only when the ordering is non-trivial */
  ierr = ISGetIndices(perm,&permidx);CHKERRQ(ierr);
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *invidx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&invidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(permidx, permidx+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(invidx, invidx+n);
    ierr = ISRestoreIndices(invperm,&invidx);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(perm,&permidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
828 
/*
   Numeric LU factorization for SeqAIJCUSPARSE: factor on the host, pick the GPU solve
   kernels according to the ordering, then copy/analyze the triangular factors on the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself runs on the host */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* identity row AND column permutations allow the permutation-free solve kernels */
  ierr = ISIdentity(fact->row,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
858 
/*
   Numeric Cholesky/ICC factorization for SeqAIJCUSPARSE: factor on the host, select GPU
   solve kernels based on the ordering, then copy/analyze the triangular factors on the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself runs on the host */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* an identity permutation enables the faster natural-ordering solve kernels */
  ierr = ISIdentity(fact->row,&natural);CHKERRQ(ierr);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
887 
/*
   Builds transposed (CSC-equivalent) copies of the already-built lower and upper triangular
   factors and runs the cusparse solve analysis on them, enabling transpose solves on the GPU.
   Results are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr.

   Requires loTriFactorPtr and upTriFactorPtr to be populated. NOTE(review): any previously
   existing transpose factors are overwritten without being freed here; presumably callers
   invoke this only when the transpose factors have not yet been built -- confirm at call sites.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below; the former initializers from A->spptr were dead stores */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     transposing flips the fill mode, all other descriptor fields carry over */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so solves are NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 (cusparseCsr2cscEx2) requires an explicit workspace buffer */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     transposing flips the fill mode, all other descriptor fields carry over */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so solves are NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1078 
1079 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1080 {
1081   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1082   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1083   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1084   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1085   cusparseStatus_t             stat;
1086   cusparseIndexBase_t          indexBase;
1087   cudaError_t                  err;
1088   PetscErrorCode               ierr;
1089 
1090   PetscFunctionBegin;
1091   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
1092   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1093   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1094   /* create cusparse matrix */
1095   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1096   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1097   indexBase = cusparseGetMatIndexBase(matstruct->descr);
1098   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1099   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1100 
1101   /* set alpha and beta */
1102   err = cudaMalloc((void **)&(matstructT->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
1103   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1104   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1105   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1106   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1107   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1108   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1109 
1110   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1111     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1112     CsrMatrix *matrixT= new CsrMatrix;
1113     matrixT->num_rows = A->cmap->n;
1114     matrixT->num_cols = A->rmap->n;
1115     matrixT->num_entries = a->nz;
1116     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1117     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1118     matrixT->values = new THRUSTARRAY(a->nz);
1119 
1120     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
1121     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1122 
1123     /* compute the transpose, i.e. the CSC */
1124    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1125     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1126                                   A->cmap->n, matrix->num_entries,
1127                                   matrix->values->data().get(),
1128                                   cusparsestruct->rowoffsets_gpu->data().get(),
1129                                   matrix->column_indices->data().get(),
1130                                   matrixT->values->data().get(),
1131                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1132                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1133                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1134     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1135    #endif
1136 
1137     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1138                             A->cmap->n, matrix->num_entries,
1139                             matrix->values->data().get(),
1140                             cusparsestruct->rowoffsets_gpu->data().get(),
1141                             matrix->column_indices->data().get(),
1142                             matrixT->values->data().get(),
1143                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1144                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1145                             CUSPARSE_ACTION_NUMERIC,indexBase,
1146                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1147                           #else
1148                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1149                             CUSPARSE_ACTION_NUMERIC, indexBase
1150                           #endif
1151                            );CHKERRCUSPARSE(stat);
1152     matstructT->mat = matrixT;
1153 
1154    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1155     stat = cusparseCreateCsr(&matstructT->matDescr,
1156                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1157                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1158                              matrixT->values->data().get(),
1159                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1160                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1161    #endif
1162   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1163    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1164     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1165    #else
1166     CsrMatrix *temp  = new CsrMatrix;
1167     CsrMatrix *tempT = new CsrMatrix;
1168     /* First convert HYB to CSR */
1169     temp->num_rows = A->rmap->n;
1170     temp->num_cols = A->cmap->n;
1171     temp->num_entries = a->nz;
1172     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1173     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1174     temp->values = new THRUSTARRAY(a->nz);
1175 
1176     stat = cusparse_hyb2csr(cusparsestruct->handle,
1177                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1178                             temp->values->data().get(),
1179                             temp->row_offsets->data().get(),
1180                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1181 
1182     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1183     tempT->num_rows = A->rmap->n;
1184     tempT->num_cols = A->cmap->n;
1185     tempT->num_entries = a->nz;
1186     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1187     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1188     tempT->values = new THRUSTARRAY(a->nz);
1189 
1190     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1191                             temp->num_cols, temp->num_entries,
1192                             temp->values->data().get(),
1193                             temp->row_offsets->data().get(),
1194                             temp->column_indices->data().get(),
1195                             tempT->values->data().get(),
1196                             tempT->column_indices->data().get(),
1197                             tempT->row_offsets->data().get(),
1198                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1199 
1200     /* Last, convert CSC to HYB */
1201     cusparseHybMat_t hybMat;
1202     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1203     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1204       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1205     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1206                             matstructT->descr, tempT->values->data().get(),
1207                             tempT->row_offsets->data().get(),
1208                             tempT->column_indices->data().get(),
1209                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1210 
1211     /* assign the pointer */
1212     matstructT->mat = hybMat;
1213     /* delete temporaries */
1214     if (tempT) {
1215       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1216       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1217       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1218       delete (CsrMatrix*) tempT;
1219     }
1220     if (temp) {
1221       if (temp->values) delete (THRUSTARRAY*) temp->values;
1222       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1223       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1224       delete (CsrMatrix*) temp;
1225     }
1226    #endif
1227   }
1228   err  = WaitForCUDA();CHKERRCUDA(err);
1229   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1230   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1231   /* the compressed row indices is not used for matTranspose */
1232   matstructT->cprowIndices = NULL;
1233   /* assign the pointer */
1234   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1235   PetscFunctionReturn(0);
1236 }
1237 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU for a factored matrix computed with non-trivial
   row/column orderings.

   Uses the *transposed* triangular factors, which are generated lazily on the
   first call.  Operationally: gather b through the row permutation into x,
   substitute with the upper factor (x -> work vector), substitute with the
   lower factor (work vector -> x), then gather x through the column
   permutation (via the work vector, since the gather cannot be done in place).

   A  - the factored matrix (spptr holds Mat_SeqAIJCUSPARSETriFactors)
   bb - right-hand side (CUDA vector, read)
   xx - solution (CUDA vector, overwritten) */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factor structures ... on the fly, on first use */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the upper factor (for the transposed system the upper factor is applied first); result goes into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve with the lower factor: work vector back into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for a pair of triangular solves over nnz total nonzeros */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1324 
/* Solve A^T x = b on the GPU when the factorization was done in natural
   ordering, i.e. no row/column permutations are needed.

   Uses the transposed triangular factors (generated lazily on first call):
   substitute with the upper factor (b -> work vector), then with the lower
   factor (work vector -> x).

   A  - the factored matrix (spptr holds Mat_SeqAIJCUSPARSETriFactors)
   bb - right-hand side (CUDA vector, read)
   xx - solution (CUDA vector, overwritten) */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factor structures ... on the fly, on first use */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the upper factor: b goes into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve with the lower factor: work vector into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for a pair of triangular solves over nnz total nonzeros */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1392 
/* Solve A x = b on the GPU for a factored matrix computed with non-trivial
   row/column orderings.

   Operationally: gather b through the row permutation into the work vector,
   forward-substitute with the lower factor (work vector -> x), back-substitute
   with the upper factor (x -> work vector), then gather the work vector
   through the column permutation into x.

   A  - the factored matrix (spptr holds Mat_SeqAIJCUSPARSETriFactors)
   bb - right-hand side (CUDA vector, read)
   xx - solution (CUDA vector, overwritten) */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: work vector into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U: x back into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: gather the work vector into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for a pair of triangular solves over nnz total nonzeros */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1467 
/* Solve A x = b on the GPU when the factorization was done in natural
   ordering, i.e. no row/column permutations are needed.

   Forward-substitutes with the lower factor (b -> work vector), then
   back-substitutes with the upper factor (work vector -> x).

   A  - the factored matrix (spptr holds Mat_SeqAIJCUSPARSETriFactors)
   bb - right-hand side (CUDA vector, read)
   xx - solution (CUDA vector, overwritten) */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b goes into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U: work vector into x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for a pair of triangular solves over nnz total nonzeros */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1527 
/* Copy the SeqAIJ matrix data from host to device; no-op if the matrix is
   bound to the CPU or the device copy is already current.

   Two paths:
   - If the matrix was assembled before with the same nonzero pattern and CSR
     storage is in use, only the numerical values are copied, and the cached
     transpose (if present) is refreshed with csr2csc.
   - Otherwise the entire GPU representation (CSR or, pre CUDA-11, ELL/HYB) is
     destroyed and rebuilt from the host arrays.

   On success A->offloadmask becomes PETSC_OFFLOAD_BOTH. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
                           );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Nonzero pattern changed (or first assembly): rebuild everything */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      /* Null the freed pointers: workVector is reassigned only near the end of
         the try block, and rowoffsets_gpu is not rebuilt in this function at
         all (the transpose-generation path tests it for NULL), so leaving them
         dangling risks use-after-free/double-delete */
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Use the compressed-row representation when available (skips empty rows) */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants used by the SpMV routines */
        err = cudaMalloc((void **)&(matstruct->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR on the device, convert it to HYB, then discard the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1705 
/* Functor for thrust algorithms over a zip iterator: accumulates the first
   tuple component into the second, i.e. dst += src elementwise. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1715 
/* Functor for thrust algorithms over a zip iterator: copies the second tuple
   component into the first (assignment in the "reverse" direction). */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1725 
/* Scratch data attached to a MatProduct involving a SEQAIJCUSPARSE matrix;
   released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool            cisdense;      /* NOTE(review): presumably records whether C is of a dense type -- confirm with the symbolic phase */
  PetscScalar          *Bt;           /* device buffer (freed with cudaFree at destroy time); assumes it holds an explicit transpose of B -- TODO confirm */
  Mat                  X;             /* auxiliary matrix, destroyed together with this struct */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;     /* cuSPARSE dense-matrix descriptors for B and C (destroyed in MatDestroy_MatMatCusparse) */
  cusparseDnMatDescr_t matCDescr;
  size_t               spmmBufferSize;
  void                 *spmmBuffer;   /* device work buffer for the SpMM call, cudaFree'd at destroy time */
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};
1739 
/* Destructor for the MatMatCusparse scratch data attached to a MatProduct:
   frees the device buffers, the cuSPARSE dense descriptors (CUDA >= 11),
   the auxiliary matrix X, and finally the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    cuerr;
  PetscErrorCode perr;

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);    /* cudaFree(NULL) is a no-op */
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cstat;
  if (mm->matBDescr) {
    cstat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->matCDescr) {
    cstat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->spmmBuffer) {
    cuerr = cudaFree(mm->spmmBuffer);CHKERRCUDA(cuerr);
  }
 #endif
  perr = MatDestroy(&mm->X);CHKERRQ(perr);
  perr = PetscFree(data);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
1758 
1759 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1760 
/* Numeric phase of C = op(A) op(B) products where A is MATSEQAIJCUSPARSE and B is dense.
   Supports AB, AtB, ABt, PtAP, RARt (see the symbolic routine below for sizing). PtAP/RARt
   first compute the intermediate X = A*B (resp. A*Bt) with cusparseSpMM/csrmm, then finish
   with a dense-dense multiply. On CUDA >= 11 the generic SpMM API (with cached descriptors and
   a scratch buffer in mmdata) is used; otherwise csrmm, which cannot transpose B, so B^T is
   formed explicitly with cublasXgeam into mmdata->Bt. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the CSR struct (A or its explicit transpose) and op(A), plus the result sizes m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the stored explicit transpose of A so cuSPARSE can run the faster non-transpose path */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X; the others write C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    /* NOTE(review): in this branch a descriptor that survives (because its lda is unchanged) keeps
       the value pointer from the previous call; assumes barray/carray are stable whenever the lda
       is stable — TODO confirm, otherwise cusparseDnMatSetValues() would be needed here as well */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form Bt = B^T explicitly with a geam out-of-place transpose */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense product against X; the booleans select op(B) in the helper */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert back any matrix we promoted to CUDA at entry / at the symbolic phase */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1930 
/* Symbolic phase for C = op(A) op(B) with A MATSEQAIJCUSPARSE and B dense: sizes C, makes C a
   CUDA dense matrix (remembering whether to convert back), and allocates the MatMatCusparse
   product data (B^T buffer on the pre-CUDA-11 path, intermediate X for PtAP/RARt) */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2004 
2005 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2006 
/* handles dense B: installs the GPU symbolic routine for all supported product types,
   or defers to the plain SeqAIJ/SeqDense path when A is bound to the CPU */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) { /* A lives on the CPU: use the host implementation */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* every dense-B product type shares one symbolic routine; anything else stays unset */
  if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB ||
      product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
      product->type == MATPRODUCT_RARt) {
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  }
  PetscFunctionReturn(0);
}
2032 
/* yy = A xx: forward to the shared kernel with no add vector, no transpose, no Hermitian */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2041 
/* zz = A xx + yy: forward to the shared kernel, no transpose, no Hermitian */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2050 
/* yy = A^H xx: forward to the shared kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2059 
/* zz = A^H xx + yy: forward to the shared kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2068 
/* yy = A^T xx: forward to the shared kernel with trans = PETSC_TRUE, herm = PETSC_FALSE */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2077 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   yy may be NULL (plain multiply) or equal to zz (in-place add). Handles compressed-row
   storage (zero rows dropped) via the cusparsestruct->workVector and thrust scatter/gather. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* empty matrix: op(A) x is zero. NOTE(review): when yy is given, zz is left untouched,
       which assumes zz == yy (or that zz already holds yy) in the MultAdd case -- TODO confirm */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* no explicit transpose stored (or ^H requested): let cuSPARSE do the (conjugate) transpose */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use the explicitly stored A^T with the faster non-transpose operation */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose case: dptr wrote zz directly with beta already accounting for yy == zz */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2260 
/* zz = A^T xx + yy: forward to the shared kernel with trans = PETSC_TRUE, herm = PETSC_FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2269 
/* Finish assembly on the host, then mirror the data to the GPU when appropriate */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* only push to the GPU on a final assembly of an unfactored matrix not bound to the CPU */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2282 
2283 /* --------------------------------------------------------------------------------*/
2284 /*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default sequential PETSc format). This matrix will ultimately be pushed down
2287    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2288    assembly performance the user should preallocate the matrix storage by setting
2289    the parameter nz (or the array nnz).  By setting these parameters accurately,
2290    performance during matrix assembly can be increased by more than a factor of 50.
2291 
2292    Collective
2293 
2294    Input Parameters:
2295 +  comm - MPI communicator, set to PETSC_COMM_SELF
2296 .  m - number of rows
2297 .  n - number of columns
2298 .  nz - number of nonzeros per row (same for all rows)
2299 -  nnz - array containing the number of nonzeros in the various rows
2300          (possibly different for each row) or NULL
2301 
2302    Output Parameter:
2303 .  A - the matrix
2304 
2305    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2307    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2308 
2309    Notes:
2310    If nnz is given then nz is ignored
2311 
2312    The AIJ format (also called the Yale sparse matrix format or
2313    compressed row storage), is fully compatible with standard Fortran 77
2314    storage.  That is, the stored row and column indices can begin at
2315    either one (as in Fortran) or zero.  See the users' manual for details.
2316 
2317    Specify the preallocated storage with either nz or nnz (not both).
2318    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2319    allocation.  For large problems you MUST preallocate memory or you
2320    will get TERRIBLE performance, see the users' manual chapter on matrices.
2321 
2322    By default, this format uses inodes (identical nodes) when possible, to
2323    improve numerical efficiency of matrix-vector products and solves. We
2324    search for consecutive rows with the same nonzero structure, thereby
2325    reusing matrix information to achieve increased efficiency.
2326 
2327    Level: intermediate
2328 
2329 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2330 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create an m x n matrix of the CUSPARSE type, then preallocate its host (SeqAIJ) storage */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2342 
/* Destroy a SeqAIJCUSPARSE matrix: free the GPU-side data (plain or factored form),
   detach the composed methods, then destroy the underlying SeqAIJ host matrix */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    /* factored matrices store triangular-factor data in spptr instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2360 
2361 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2362 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by first cloning the host SeqAIJ data, then converting the copy in place
   back to the CUSPARSE type (which installs the GPU ops and structures) */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2372 
/* Toggle between CPU (SeqAIJ) and GPU (CUSPARSE) implementations of the mult ops and the
   dense-product hooks; factored matrices are left untouched */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* bind to CPU: fall back to the host SeqAIJ kernels; Hermitian variants have no host analogue here */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* unbind: restore the GPU kernels and re-expose the dense-product implementations */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
2408 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE.

   Handles all three MatReuse modes: MAT_INITIAL_MATRIX duplicates A into
   *newmat, MAT_REUSE_MATRIX copies values into the existing *newmat, and
   MAT_INPLACE_MATRIX (neither branch taken) converts A itself.  The converted
   matrix gets VECCUDA as its default vector type, a freshly allocated
   CUSPARSE context in spptr (a Mat_SeqAIJCUSPARSE for unfactored matrices,
   a Mat_SeqAIJCUSPARSETriFactors for factors), the CUSPARSE operation table
   (installed via MatBindToCPU_SeqAIJCUSPARSE with flg=PETSC_FALSE), and the
   MatCUSPARSESetFormat_C composed function. */
2409 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
2410 {
2411   PetscErrorCode   ierr;
2412   cusparseStatus_t stat;
2413   Mat              B;
2414 
2415   PetscFunctionBegin;
2416   if (reuse == MAT_INITIAL_MATRIX) {
2417     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
2418   } else if (reuse == MAT_REUSE_MATRIX) {
2419     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
2420   }
2421   B = *newmat;
2422 
      /* vectors created to match this matrix (e.g. by MatCreateVecs()) default to CUDA vectors */
2423   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
2424   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
2425 
      /* allocate the GPU-side context only once; a reused matrix already has one */
2426   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
2427     if (B->factortype == MAT_FACTOR_NONE) {
2428       Mat_SeqAIJCUSPARSE *spptr;
2429 
2430       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2431       spptr->format = MAT_CUSPARSE_CSR;
2432       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2433       B->spptr = spptr;
2434     } else {
2435       Mat_SeqAIJCUSPARSETriFactors *spptr;
2436 
2437       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2438       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2439       B->spptr = spptr;
2440     }
      /* nothing lives on the GPU yet; MatAssemblyEnd/CopyToGPU allocates it */
2441     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
2442   }
2443   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
2444   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
2445   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
2446   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2447   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
2448 
      /* unbinding installs the CUSPARSE mult operations and MatProduct hooks */
2449   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
2450   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2451   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
2452   PetscFunctionReturn(0);
2453 }
2454 
/* Type-creation routine registered for MATSEQAIJCUSPARSE.

   Verifies CUDA is usable, builds the host SeqAIJ skeleton, converts it in
   place to SEQAIJCUSPARSE, and then processes the matrix's options-database
   entries (e.g. -mat_cusparse_storage_format).  The PetscObjectOptionsBegin /
   PetscOptionsEnd pair brackets the PetscOptionsObject used by
   MatSetFromOptions_SeqAIJCUSPARSE and must remain balanced. */
2455 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
2456 {
2457   PetscErrorCode ierr;
2458 
2459   PetscFunctionBegin;
2460   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
2461   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
2462   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2463   ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
2464   ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
2465   ierr = PetscOptionsEnd();CHKERRQ(ierr);
2466   PetscFunctionReturn(0);
2467 }
2468 
2469 /*MC
2470    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2471 
2472    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2473    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2474    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2475 
2476    Options Database Keys:
2477 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2478 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2479 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2480 
2481   Level: beginner
2482 
2483 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2484 M*/
2485 
2486 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2487 
2488 
/* Register the CUSPARSE solver package for SEQAIJCUSPARSE matrices.

   Registers MatGetFactor_seqaijcusparse_cusparse as the factory for every
   factorization type the package supports: LU, Cholesky, ILU, and ICC. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2500 
/* Destroy the GPU-side context of an unfactored SEQAIJCUSPARSE matrix.

   Releases both mult structures (the matrix and its cached transpose), the
   device work vector and row-offsets array, the CUSPARSE library handle, the
   CUDA-11+ csr2csc conversion buffer, and finally the struct itself.  The
   caller's pointer is freed by PetscFree; a NULL *cusparsestruct is a no-op. */
2501 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
2502 {
2503   PetscErrorCode   ierr;
2504   cusparseStatus_t stat;
2505   cusparseHandle_t handle;
2506 
2507   PetscFunctionBegin;
2508   if (*cusparsestruct) {
2509     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
2510     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
2511     delete (*cusparsestruct)->workVector;
2512     delete (*cusparsestruct)->rowoffsets_gpu;
      /* intentional assignment-in-condition: destroy the handle only if one was created */
2513     if (handle = (*cusparsestruct)->handle) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
2514    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* cudaFree(NULL) is a documented no-op, so no guard is needed here */
2515     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
2516    #endif
2517     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
2518   }
2519   PetscFunctionReturn(0);
2520 }
2521 
/* Free a CsrMatrix: its device value/index containers and the struct itself,
   then zero the caller's pointer.  A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
2534 
/* Destroy one triangular-factor structure: its CUSPARSE matrix descriptor,
   the triangular-solve analysis info, the CSR storage, the CUDA-11+ solve and
   csr2csc scratch buffers, and the struct itself.  Zeroes the caller's
   pointer; a NULL *trifactor is a no-op.  (This is the TriFactorStruct
   overload of MatSeqAIJCUSPARSEMultStruct_Destroy.) */
2535 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
2536 {
2537   cusparseStatus_t stat;
2538   PetscErrorCode   ierr;
2539 
2540   PetscFunctionBegin;
2541   if (*trifactor) {
2542     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2543     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
2544     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
2545    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2546     cudaError_t cerr;
2547     if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
2548     if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
2549    #endif
2550     delete *trifactor;
2551     *trifactor = 0;
2552   }
2553   PetscFunctionReturn(0);
2554 }
2555 
/* Destroy a mult structure: the stored matrix (HYB or CSR depending on
   'format'), its CUSPARSE descriptor, the compressed-row index array, the
   device-resident scalar constants, and (CUDA 11+) the generic SpMat
   descriptor plus any initialized SpMV dense-vector descriptors and buffers.
   Zeroes the caller's pointer; a NULL *matstruct is a no-op.

   Fix: the CsrMatrix_Destroy() call previously discarded its PetscErrorCode
   return value; it is now checked with CHKERRQ like every other call here. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* was: return value silently ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-side scalars allocated with cudaMalloc */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* three cached SpMV setups (one per operation variant); free only those initialized */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2599 
/* Reset a trifactors container without freeing the container or its CUSPARSE
   handle: destroy all four triangular-factor structures (lower/upper and
   their transposes), then delete and zero the permutation index arrays and
   the device work vector.  A NULL *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = 0;
    fs->cpermIndices = 0;
    fs->workVector   = 0;
  }
  PetscFunctionReturn(0);
}
2619 
/* Fully destroy a trifactors container: reset its contents (factors,
   permutations, work vector), destroy its CUSPARSE library handle if one was
   created, and free the container itself.  A NULL *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2636