xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision efbe7e8a80d07327753dbe0b33efee01e046af3f)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Human-readable names for the MatCUSPARSEStorageFormat enum, consumed by
   PetscOptionsEnum(); the trailing entries are, by PETSc convention, the enum
   type name, the option prefix to strip, and a NULL terminator. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* NOTE: entries are ordered by the cuSPARSE enum's 0-based integer value so that the
     position chosen by PetscOptionsEnum() maps directly onto the cuSPARSE value;
     MatSetFromOptions_SeqAIJCUSPARSE() cross-checks this assumption at runtime. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
74 
75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
81 
/* Attach a CUDA stream to this matrix: the stream is recorded in the matrix'
   GPU data structure and installed on its cuSPARSE handle, so subsequent
   cuSPARSE calls made through this matrix are enqueued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
92 
/* Replace the matrix' cuSPARSE library handle. If the matrix currently owns a
   different handle it is destroyed first; the (possibly new) handle is then
   switched to device pointer mode, which this implementation requires. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the handle we currently hold before adopting the caller's */
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
108 
/* Forget (without destroying) the cuSPARSE handle stored on the matrix.
   Used when the handle is owned elsewhere and must not be freed when the
   matrix is destroyed.

   Fix: guard against A->spptr being NULL -- the original dereferenced it
   unconditionally, crashing when called on a matrix whose GPU data was never
   allocated or was already torn down. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* no GPU data: nothing to clear */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
117 
/* Query callback composed on factor matrices under "MatFactorGetSolverType_C":
   reports that this factor was produced by the "cusparse" solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
124 
125 /*MC
126   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
127   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
129   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
130   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
131   algorithms are not recommended. This class does NOT support direct solver operations.
132 
133   Level: beginner
134 
135 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
136 M*/
137 
/* MatGetFactor() implementation registered for MATSEQAIJCUSPARSE with the
   "cusparse" solver package: creates an empty square factor matrix of the
   same (local) size as A and installs the symbolic-factorization entry
   points appropriate for the requested factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
163 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
   A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both simply store the requested format. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both supported operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
181 
182 /*@
183    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
184    operation. Only the MatMult operation can use different GPU storage formats
185    for MPIAIJCUSPARSE matrices.
186    Not Collective
187 
188    Input Parameters:
189 +  A - Matrix of type SEQAIJCUSPARSE
190 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
191 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
192 
193    Output Parameter:
194 
195    Level: intermediate
196 
197 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
198 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if the matrix type provides one;
     a no-op for matrix types that did not compose "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
208 
209 /*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
211 
212    Collective on mat
213 
214    Input Parameters:
215 +  A - Matrix of type SEQAIJCUSPARSE
216 -  transgen - the boolean flag
217 
218    Level: intermediate
219 
220 .seealso: MATSEQAIJCUSPARSE
221 @*/
/* Enable or disable caching of an explicit GPU transpose used by
   MatMultTranspose(). Silently ignores matrices that are not of type
   MATSEQAIJCUSPARSE; errors on factored matrices. */
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (!iscusp) PetscFunctionReturn(0); /* other matrix types: nothing to do */
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    cusp->transgen = transgen;
    if (!transgen) {
      /* drop any cached transpose so a later re-enable cannot observe stale data */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
241 
/* Options handler for MATSEQAIJCUSPARSE: parses the -mat_cusparse_* command
   line options and applies them to the matrix. Only non-factored matrices are
   configurable; factored matrices take no options here. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    /* whether to build an explicit transpose for MatMultTranspose() */
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* storage format used by SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* storage format used by all operations (parsed second, so it wins if both options are given) */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* NOTE(review): cuSPARSE has no csr2csc enum with value 0, so index 0 of
       MatCUSPARSECsr2CscAlgorithms[] is a placeholder named "INVALID" */
    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
285 
/* Symbolic ILU: defer to the CPU AIJ implementation, then substitute the
   CUSPARSE numeric-factorization kernel for the subsequent numeric phase. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
295 
/* Symbolic LU: defer to the CPU AIJ implementation, then substitute the
   CUSPARSE numeric-factorization kernel for the subsequent numeric phase. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
305 
/* Symbolic ICC: defer to the CPU AIJ implementation, then substitute the
   CUSPARSE Cholesky numeric kernel for the subsequent numeric phase. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
315 
/* Symbolic Cholesky: defer to the CPU AIJ implementation, then substitute the
   CUSPARSE Cholesky numeric kernel for the subsequent numeric phase. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
325 
/* Packs the lower-triangular (L) factor of an already-computed ILU into a
   device CSR matrix with a unit diagonal, creates the cuSPARSE triangular
   solve analysis for it, and stores the result in loTriFactorPtr.
   Work is done only if the host copy is the current one (offloadmask is
   UNALLOCATED or CPU); otherwise the existing GPU copy is kept.
   Staging arrays are built in pinned host memory (cudaMallocHost) so the
   thrust assign() copies to device run at full transfer speed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): this initial value is never read -- loTriFactor is always
     overwritten with a fresh allocation below before first use */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;   /* staged CSR row offsets / column indices */
  PetscScalar                       *AALo;          /* staged CSR values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* assumes the factored AIJ layout where rows 1..n-1 store their strictly-lower
         entries in ai/aj and row 0's lower part is empty -- TODO confirm against MatILUFactorSymbolic_SeqAIJ */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;   /* row 0 holds only its unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* upload the staged CSR arrays to the device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11: the csrsv2 API requires an explicit work buffer for the solve */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* staging arrays are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust/cuSPARSE failures typically throw std::exception-derived
         types, which a char* handler will not catch -- verify this is intentional */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
446 
/* Packs the upper-triangular (U) factor of an already-computed ILU into a
   device CSR matrix, creates the cuSPARSE triangular solve analysis for it,
   and stores the result in upTriFactorPtr. Mirrors
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix() but walks rows bottom-up using
   a->diag and inverts the diagonal entries (the solve uses a non-unit
   diagonal). Work is done only when the host copy is current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): this initial value is never read -- upTriFactor is always
     overwritten with a fresh allocation below before first use */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;   /* staged CSR row offsets / column indices */
  PetscScalar                       *AAUp;          /* staged CSR values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* assumes adiag[] runs backwards through the U part as in the factored
         SeqAIJ layout -- TODO confirm against MatILUFactorSymbolic_SeqAIJ */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];   /* store the reciprocal of the diagonal */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* upload the staged CSR arrays to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11: the csrsv2 API requires an explicit work buffer for the solve */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                   upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                   upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                   upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                   &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* staging arrays are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust/cuSPARSE failures typically throw std::exception-derived
         types, which a char* handler will not catch -- verify this is intentional */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
563 
/* Rebuilds the complete GPU ILU solve data for A: resets any existing
   triangular-factor structures, uploads the L and U factors and their solve
   analyses, allocates the solve work vector, and uploads the row/column
   permutations when the orderings are not the identity. Marks the matrix as
   valid on both host and device afterwards. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* discard any previous GPU factor data before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector used between the L and U solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only upload the row permutation when it actually permutes something */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation data actually transferred to the device */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
611 
/*
   Builds the GPU (cusparse) triangular-solve data structures for an ICC factor.

   Only the upper triangular pattern is assembled on the host (AiUp/AjUp) and it is
   uploaded twice, once with values AAUp (upper solve) and once with values AALo:
   the "lower" factor keeps CUSPARSE_FILL_MODE_UPPER but uses the TRANSPOSE solve
   operation, so the lower solve is performed as the transpose of the stored upper
   pattern and no host-side transposition is needed.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for the nonzero count a->nz)
   and to Mat_SeqSBAIJ (for the i/j/a arrays); the factor is assumed to be stored
   in the SBAIJ upper-triangular layout with the diagonal entry last in each row
   (it is read as v[nz] below) -- confirm against the host ICC factorization.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the current data lives on the CPU (or nowhere yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers so the thrust uploads below are fast */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* the stored 1/d on AAUp's diagonal is ignored by the solve (the upper
           descriptor declares a unit diagonal below); AALo's 1/d is used (non-unit) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: AAUp holds the negated row entries; AALo holds the
             same entries additionally scaled by 1/d_i */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 (csrsv2) needs an explicitly sized/allocated analysis buffer */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      /* intentionally UPPER: the lower solve is run as the TRANSPOSE of this
         (upper-stored) pattern -- see solveOp just below */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      /* log both factor uploads: 2 x (row offsets + column indices + values) */
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
791 
/*
   Rebuilds the GPU triangular-factor structures for an ICC factor and uploads the
   row permutation (and its inverse) when the ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  const PetscInt               *permidx;
  PetscBool                    identity;
  PetscInt                     nrows    = A->rmap->n;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* discard any previous GPU factor data and rebuild from the current host factor */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz        = (aij->nz-nrows)*2 + nrows; /* both triangles, diagonal counted once */

  ierr = ISGetIndices(rowperm,&permidx);CHKERRQ(ierr);
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *invidx;

    /* upload the row permutation and its inverse (used as the column permutation) */
    ierr = ISInvertPermutation(rowperm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&invidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(permidx, permidx+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(invidx, invidx+nrows);
    ierr = ISRestoreIndices(invperm,&invidx);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(rowperm,&permidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
828 
/*
   Numeric LU factorization: runs the host SeqAIJ factorization, then selects the
   appropriate GPU solve kernels and uploads the triangular factors.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself happens on the host */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* natural (identity) row and column orderings allow the permutation-free kernels */
  ierr = ISIdentity(baij->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(baij->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
858 
/*
   Numeric Cholesky factorization: runs the host SeqAIJ factorization, then selects
   the appropriate GPU solve kernels and uploads the triangular factors.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself happens on the host */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* a natural (identity) ordering allows the permutation-free solve kernels */
  ierr = ISIdentity(baij->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
887 
/*
   Builds explicit transposes (CSC form) of the lower and upper triangular factors
   so that transposed solves can later run with CUSPARSE_OPERATION_NON_TRANSPOSE.

   For each factor: copy the descriptor with the fill mode flipped, convert the CSR
   data to CSC with cusparse_csr2csc, then run the cusparse solve analysis on the
   converted matrix. Requires cusparseTriFactors->loTriFactorPtr/upTriFactorPtr to
   have been built already.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  /* NOTE(review): the two initializers above are never read -- both pointers are
     re-assigned with fresh allocations below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor */
  /* copy type/base/diag from the original descriptor but flip the fill mode,
     since the transposed data swaps upper and lower triangles */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 (Csr2cscEx2) requires an explicitly sized conversion buffer;
     note cerr is declared here and reused in the later CUDA >= 11 sections */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  /* the CUDA < 11 and >= 11 APIs take the output index arrays in a different order */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 (csrsv2) needs an explicitly sized/allocated analysis buffer */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor */
  /* same descriptor copy with flipped fill mode as for the lower factor above */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1078 
1079 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1080 {
1081   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1082   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1083   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1084   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1085   cusparseStatus_t             stat;
1086   cusparseIndexBase_t          indexBase;
1087   cudaError_t                  err;
1088   PetscErrorCode               ierr;
1089 
1090   PetscFunctionBegin;
1091   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
1092   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1093   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1094   /* create cusparse matrix */
1095   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1096   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1097   indexBase = cusparseGetMatIndexBase(matstruct->descr);
1098   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1099   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1100 
1101   /* set alpha and beta */
1102   err = cudaMalloc((void **)&(matstructT->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
1103   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1104   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1105   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1106   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1107   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1108   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1109 
1110   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1111     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1112     CsrMatrix *matrixT= new CsrMatrix;
1113     matrixT->num_rows = A->cmap->n;
1114     matrixT->num_cols = A->rmap->n;
1115     matrixT->num_entries = a->nz;
1116     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1117     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1118     matrixT->values = new THRUSTARRAY(a->nz);
1119 
1120     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
1121     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1122 
1123     /* compute the transpose, i.e. the CSC */
1124    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1125     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1126                                   A->cmap->n, matrix->num_entries,
1127                                   matrix->values->data().get(),
1128                                   cusparsestruct->rowoffsets_gpu->data().get(),
1129                                   matrix->column_indices->data().get(),
1130                                   matrixT->values->data().get(),
1131                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1132                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1133                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1134     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1135    #endif
1136 
1137     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1138                             A->cmap->n, matrix->num_entries,
1139                             matrix->values->data().get(),
1140                             cusparsestruct->rowoffsets_gpu->data().get(),
1141                             matrix->column_indices->data().get(),
1142                             matrixT->values->data().get(),
1143                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1144                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1145                             CUSPARSE_ACTION_NUMERIC,indexBase,
1146                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1147                           #else
1148                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1149                             CUSPARSE_ACTION_NUMERIC, indexBase
1150                           #endif
1151                            );CHKERRCUSPARSE(stat);
1152     matstructT->mat = matrixT;
1153 
1154    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1155     stat = cusparseCreateCsr(&matstructT->matDescr,
1156                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1157                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1158                              matrixT->values->data().get(),
1159                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1160                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1161    #endif
1162   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1163    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1164     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1165    #else
1166     CsrMatrix *temp  = new CsrMatrix;
1167     CsrMatrix *tempT = new CsrMatrix;
1168     /* First convert HYB to CSR */
1169     temp->num_rows = A->rmap->n;
1170     temp->num_cols = A->cmap->n;
1171     temp->num_entries = a->nz;
1172     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1173     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1174     temp->values = new THRUSTARRAY(a->nz);
1175 
1176     stat = cusparse_hyb2csr(cusparsestruct->handle,
1177                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1178                             temp->values->data().get(),
1179                             temp->row_offsets->data().get(),
1180                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1181 
1182     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1183     tempT->num_rows = A->rmap->n;
1184     tempT->num_cols = A->cmap->n;
1185     tempT->num_entries = a->nz;
1186     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1187     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1188     tempT->values = new THRUSTARRAY(a->nz);
1189 
1190     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1191                             temp->num_cols, temp->num_entries,
1192                             temp->values->data().get(),
1193                             temp->row_offsets->data().get(),
1194                             temp->column_indices->data().get(),
1195                             tempT->values->data().get(),
1196                             tempT->column_indices->data().get(),
1197                             tempT->row_offsets->data().get(),
1198                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1199 
1200     /* Last, convert CSC to HYB */
1201     cusparseHybMat_t hybMat;
1202     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1203     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1204       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1205     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1206                             matstructT->descr, tempT->values->data().get(),
1207                             tempT->row_offsets->data().get(),
1208                             tempT->column_indices->data().get(),
1209                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1210 
1211     /* assign the pointer */
1212     matstructT->mat = hybMat;
1213     /* delete temporaries */
1214     if (tempT) {
1215       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1216       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1217       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1218       delete (CsrMatrix*) tempT;
1219     }
1220     if (temp) {
1221       if (temp->values) delete (THRUSTARRAY*) temp->values;
1222       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1223       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1224       delete (CsrMatrix*) temp;
1225     }
1226    #endif
1227   }
1228   err  = WaitForCUDA();CHKERRCUDA(err);
1229   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1230   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1231   /* the compressed row indices is not used for matTranspose */
1232   matstructT->cprowIndices = NULL;
1233   /* assign the pointer */
1234   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1235   PetscFunctionReturn(0);
1236 }
1237 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  Solves A^T x = b on the GPU using transposed triangular factors, applying the
  row permutation to b on entry and the column permutation to x on exit.
  The transposed factor analysis is built lazily on first use.
  Since A = L*U (up to permutations), A^T = U^T * L^T, so the upper factor is
  applied before the lower one here -- the reverse of MatSolve_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only the first time through */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather b into x with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the upper factor (input xarray, result into tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve with the lower factor (input tempGPU, result back into xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* each triangular solve is ~nnz multiply-adds; the subtraction removes the double-counted diagonal */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1324 
/*
  Solves A^T x = b on the GPU for a factorization in natural ordering, i.e. with
  identity row/column permutations, so no permutation gathers are needed.
  The transposed factor analysis is built lazily on first use; the upper factor
  is applied before the lower one (A^T = U^T * L^T).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only the first time through */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the upper factor (input barray, result into tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve with the lower factor (input tempGPU, result into xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* each triangular solve is ~nnz multiply-adds; the subtraction removes the double-counted diagonal */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1392 
/*
  Solves A x = b on the GPU using the L and U triangular factors built at
  factorization time, applying the row permutation to b on entry and the
  column permutation to the result on exit (L first, then U).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather b into the work vector with the row permutation: temp[i] = b[rperm[i]]
     (the copy length is determined by the rpermIndices range, so the same bGPU base in both iterators is fine) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (input tempGPU, result into xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U (input xarray, result into tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, gather the solution into x with the column permutation: x[i] = temp[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* each triangular solve is ~nnz multiply-adds; the subtraction removes the double-counted diagonal */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1467 
/*
  Solves A x = b on the GPU for a factorization in natural ordering, i.e. with
  identity row/column permutations, so no permutation gathers are needed
  (L first, then U).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L (input barray, result into tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U (input tempGPU, result into xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* each triangular solve is ~nnz multiply-adds; the subtraction removes the double-counted diagonal */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1527 
/*
  Mirrors the host AIJ matrix onto the GPU in the chosen cusparse storage format.

  Two paths:
  - If the matrix was assembled before with the same nonzero pattern (and CSR
    format), only the numerical values are re-uploaded, and the cached transpose
    (if any) is refreshed via csr2csc.
  - Otherwise the GPU structures are rebuilt from scratch (pattern + values).

  Fix vs. previous revision: after deleting workVector/rowoffsets_gpu the
  pointers are reset to NULL. rowoffsets_gpu is never reassigned in this
  function, so leaving it dangling would break code that NULL-checks or
  dereferences it later (e.g. the transpose update above uses rowoffsets_gpu);
  likewise an exception thrown inside the try block would leave workVector
  dangling.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0); /* respect MatBindToCPU(): never touch the GPU copy */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Same nonzero pattern as before: copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
                           );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* New (or changed) nonzero pattern: tear down and rebuild the GPU structures */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      /* reset the pointers so nobody (including the exception path below) sees a dangling value */
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed rows, only the nonempty rows are represented on the GPU */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, required by CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR copy on the GPU, convert it to HYB, then free the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1699 
/* Functor for thrust algorithms over zipped iterators: accumulates the first
   tuple component into the second, i.e. elementwise y += x. */
struct VecCUDAPlusEquals
{
  template <typename Zipped>
  __host__ __device__
  void operator()(Zipped z)
  {
    thrust::get<1>(z) = thrust::get<0>(z) + thrust::get<1>(z);
  }
};
1709 
/* Functor for thrust algorithms over zipped iterators: assigns the second
   tuple component to the first, i.e. a reversed-direction elementwise copy. */
struct VecCUDAEqualsReverse
{
  template <typename Zipped>
  __host__ __device__
  void operator()(Zipped z)
  {
    thrust::get<0>(z) = thrust::get<1>(z);
  }
};
1719 
/* Per-product scratch data attached to a MatProduct whose A is SeqAIJCUSPARSE;
   stored in C->product->data and freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool            cisdense;       /* whether the result C is of (CUDA) dense type -- set by the symbolic phase; TODO confirm against caller */
  PetscScalar          *Bt;            /* device buffer holding an explicit transpose of B when one is needed (may be NULL) */
  Mat                  X;              /* auxiliary work matrix -- presumably used by composite products such as PtAP; verify against the product routines */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cusparseSpMM() state for C = alpha op(A) op(B) + beta C (generic SpMM API, CUDA >= 11) */
  PetscBool            initialized;    /* have the descriptors/buffer below been set up? */
  cusparseDnMatDescr_t matBDescr;      /* dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;      /* dense descriptor for C */
  size_t               spmmBufferSize; /* size in bytes of spmmBuffer */
  void                 *spmmBuffer;    /* device workspace for cusparseSpMM */
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};
1733 
/*
  Destructor for the MatMatCusparse scratch data attached to a MatProduct:
  releases the device buffers, the cusparse dense-matrix descriptors
  (CUDA >= 11 only), the auxiliary matrix, and finally the struct itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;
  cudaError_t    cuerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* release the device buffer for B^t (cudaFree on a NULL pointer performs no operation) */
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cusp;
  if (mm->matBDescr)  {cusp  = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cusp);}
  if (mm->matCDescr)  {cusp  = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cusp);}
  if (mm->spmmBuffer) {cuerr = cudaFree(mm->spmmBuffer);CHKERRCUDA(cuerr);}
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1752 
1753 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1754 
/* Numeric phase of C = op(A) op(B) (and the PtAP/RARt variants) with A SeqAIJCUSPARSE and B dense.
   Uses cusparseSpMM (CUDA >= 11) or cusparseXcsrmm (CUDA < 11) on the GPU; for PtAP/RARt the
   sparse product is written into the intermediate mmdata->X and C is finalized with a dense product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the m x n sizes of op(A) op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* prefer the explicitly generated transpose over the cuSPARSE transpose operation */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* the sparse product goes into the intermediate X; C is finalized below */
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  }
  /* The value pointers of A, B and C may change between calls. In particular, a descriptor kept
     from a previous call (e.g. when only the other leading dimension changed above) would retain
     a stale device pointer, so always refresh all value pointers here; resetting a pointer that
     did not change is harmless. */
  stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
  stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* out-of-place transpose of B into the preallocated mmdata->Bt via cublasXgeam */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* the user's C was MATSEQDENSE (CPU): convert the result back in place */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* likewise, restore B to the CPU type it came in with */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1924 
/* Symbolic phase for products of a SeqAIJCUSPARSE matrix with a dense matrix:
   sizes and types C, allocates the MatMatCusparse product data, and installs the
   numeric routine. Only the MAT_CUSPARSE_CSR storage format is supported. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes: AB -> rows(A) x cols(B); AtB -> cols(A) x cols(B); ABt -> rows(A) x rows(B);
     PtAP -> cols(B) x cols(B); RARt -> rows(B) x rows(B) */
  if (product->type == MATPRODUCT_AB) {
    m = A->rmap->n;
    n = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    m = A->cmap->n;
    n = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    m = A->rmap->n;
    n = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    m = B->cmap->n;
    n = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    m = B->rmap->n;
    n = B->rmap->n;
  } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
1998 
1999 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2000 
2001 /* handles dense B */
/* handles dense B: installs the CUDA symbolic routine for the supported product
   types; falls back to the host implementation when A is bound to the CPU */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* all five supported product types share the same symbolic routine;
     anything else is left untouched */
  if (product->type == MATPRODUCT_AB  || product->type == MATPRODUCT_AtB  ||
      product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_PtAP ||
      product->type == MATPRODUCT_RARt) {
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  }
  PetscFunctionReturn(0);
}
2026 
/* yy = A xx on the GPU (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2035 
/* zz = A xx + yy on the GPU (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2044 
/* yy = A^H xx on the GPU (trans = herm = true in the shared kernel) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2053 
/* zz = A^H xx + yy on the GPU */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2062 
/* yy = A^T xx on the GPU (transpose without conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2071 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   This is the shared kernel behind all Mult/MultAdd/MultTranspose/MultHermitianTranspose variants:
   yy may be NULL (plain multiply) and may alias zz (in-place add). */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed; /* does the GPU copy drop zero rows (compressed-row storage)? */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x, taken from the CSR sizes below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* no nonzero rows: op(A) x == 0, so z is just y (or zero when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* use cuSPARSE's (conjugate-)transpose operation on the untransposed data */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* multiply with an explicitly stored transpose, generated on demand */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV cache; guard against enum layout changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (mul+add) per stored nonzero; without an add, one add per nonzero row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2254 
/* zz = A^T xx + yy on the GPU */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2263 
/* Finish assembly: if a device-side split-CSR matrix exists, consult it to decide whether
   the freshest data lives on the CPU (copy it up) or on the GPU (mark the offload mask) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
  PetscBool                   is_seq = PETSC_TRUE;
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  if (d_mat) {
    cudaError_t err;
    ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
    /* fetch the device-side descriptor to learn its nonzerostate and seq flag */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    nnz_state = h_mat.nonzerostate;
    is_seq = h_mat.seq;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  } else if (nnz_state > A->nonzerostate) { // GPU copy is newer: record that the GPU holds the valid data
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
2291 
2292 /* --------------------------------------------------------------------------------*/
2293 /*@
2294    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2296    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2297    assembly performance the user should preallocate the matrix storage by setting
2298    the parameter nz (or the array nnz).  By setting these parameters accurately,
2299    performance during matrix assembly can be increased by more than a factor of 50.
2300 
2301    Collective
2302 
2303    Input Parameters:
2304 +  comm - MPI communicator, set to PETSC_COMM_SELF
2305 .  m - number of rows
2306 .  n - number of columns
2307 .  nz - number of nonzeros per row (same for all rows)
2308 -  nnz - array containing the number of nonzeros in the various rows
2309          (possibly different for each row) or NULL
2310 
2311    Output Parameter:
2312 .  A - the matrix
2313 
2314    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2316    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2317 
2318    Notes:
2319    If nnz is given then nz is ignored
2320 
2321    The AIJ format (also called the Yale sparse matrix format or
2322    compressed row storage), is fully compatible with standard Fortran 77
2323    storage.  That is, the stored row and column indices can begin at
2324    either one (as in Fortran) or zero.  See the users' manual for details.
2325 
2326    Specify the preallocated storage with either nz or nnz (not both).
2327    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2328    allocation.  For large problems you MUST preallocate memory or you
2329    will get TERRIBLE performance, see the users' manual chapter on matrices.
2330 
2331    By default, this format uses inodes (identical nodes) when possible, to
2332    improve numerical efficiency of matrix-vector products and solves. We
2333    search for consecutive rows with the same nonzero structure, thereby
2334    reusing matrix information to achieve increased efficiency.
2335 
2336    Level: intermediate
2337 
2338 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2339 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* standard construction sequence: create, size, type, preallocate */
  err = MatCreate(comm,A);CHKERRQ(err);
  err = MatSetSizes(*A,m,n,m,n);CHKERRQ(err);
  err = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(err);
  err = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2351 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *dmat = NULL;

  PetscFunctionBegin;
  /* Release the GPU-side storage first, but keep the device split-CSR pointer (if any)
     so its device allocations can be freed below */
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

    dmat             = spptr->deviceMat;
    spptr->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (dmat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    PetscSplitCSRDataStructure h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device descriptor back to learn which device allocations it owns */
    cerr = cudaMemcpy( &h_mat, dmat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (h_mat.seq) {
      if (a->compressedrow.use) {
        cerr = cudaFree(h_mat.diag.i);CHKERRCUDA(cerr);
      }
      cerr = cudaFree(dmat);CHKERRCUDA(cerr);
    }
  }
  /* remove the composed methods, then fall through to the host AIJ destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2385 
2386 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2387 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* duplicate on the host, then convert the copy in place to the CUSPARSE type */
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(err);
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2397 
/* Switches the core multiplication operations of A between the host (SeqAIJ)
   implementations and the device (CUSPARSE) implementations. Factored matrices
   are unaffected. Composed MatProduct hooks are installed/removed to match. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices never run the cusparse mult kernels; nothing to rebind */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (!flg) {
    /* unbound: route multiplications through the CUSPARSE kernels */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* bound to CPU: fall back to the host SeqAIJ kernels (no host Hermitian variants) */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
2433 
/* Zeroes all stored values of A: the device-resident split-CSR copy (if one was
   built) and the host SeqAIJ values. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure  *d_mat = NULL;
  PetscErrorCode               ierr;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
    PetscInt     n = A->rmap->n, nnz = a->i[n];
    cudaError_t  err;
    PetscScalar  *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    /* fetch the device pointer to the value array from the device-side descriptor */
    err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* cudaMemset is byte-wise, which is correct for a 0 fill of any PetscScalar */
    err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);

  /* NOTE(review): host and device are declared synchronized even when no deviceMat
     exists (the cusparse copy is not zeroed here) -- presumably it is rebuilt from
     the host values on next use; confirm before relying on this. */
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}
2458 
/* Converts a SeqAIJ matrix to SEQAIJCUSPARSE. Depending on reuse, works on a
   fresh duplicate (MAT_INITIAL_MATRIX), an existing target (MAT_REUSE_MATRIX),
   or A itself (MAT_INPLACE_MATRIX). Allocates the GPU-side container, switches
   the default vector type to CUDA, and installs the CUSPARSE ops table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* MAT_INPLACE_MATRIX: convert A itself, *newmat already aliases A */
    break;
  }
  B = *newmat;

  /* CUSPARSE matrices pair with CUDA vectors by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* an ordinary matrix carries a Mat_SeqAIJCUSPARSE */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format    = MAT_CUSPARSE_CSR;
      spptr->deviceMat = NULL;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      /* a factored matrix carries the triangular-factor container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  /* install the CUSPARSE implementations of the core operations */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  /* start unbound (GPU kernels active) and advertise the type and its options */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2506 
/* Type constructor for MATSEQAIJCUSPARSE: builds a host SeqAIJ matrix, converts
   it in place to the CUSPARSE type, and processes its options. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure a CUDA device is initialized before any cusparse setup happens */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscObjectOptionsBegin() declares the PetscOptionsObject used just below */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2520 
2521 /*MC
2522    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2523 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2525    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2526    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2527 
2528    Options Database Keys:
2529 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2530 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2531 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2532 
2533   Level: beginner
2534 
2535 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2536 M*/
2537 
2538 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2539 
2540 
/* Registers the cusparse solver package for SEQAIJCUSPARSE matrices. All four
   factorization kinds are served by the same factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscInt            i;

  PetscFunctionBegin;
  for (i=0; i<(PetscInt)(sizeof(ftypes)/sizeof(ftypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2552 
/* Frees a Mat_SeqAIJCUSPARSE container: both mult structures (plain and
   transpose), the scratch vector, the row-offset array, the cusparse handle,
   and (CUDA >= 11) the csr2csc workspace; finally frees the struct itself and
   nulls the caller's pointer (via PetscFree). */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (cusp) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is a harmless no-op if the buffer was never allocated */
    cudaError_t cerr = cudaFree(cusp->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2573 
/* Deletes a CsrMatrix wrapper and the three device arrays it owns, then nulls
   the caller's pointer. Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
2586 
/* Overload for a single triangular-factor structure: releases the cusparse
   descriptor and analysis info, the CSR storage, and (CUDA >= 11) the solve and
   csr2csc workspaces, then deletes the struct and nulls the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t                   stat;
  PetscErrorCode                     ierr;
  Mat_SeqAIJCUSPARSETriFactorStruct *fs = *trifactor;

  PetscFunctionBegin;
  if (fs) {
    if (fs->descr)     {stat = cusparseDestroyMatDescr(fs->descr);CHKERRCUSPARSE(stat);}
    if (fs->solveInfo) {stat = cusparse_destroy_analysis_info(fs->solveInfo);CHKERRCUSPARSE(stat);}
    ierr = CsrMatrix_Destroy(&fs->csrMat);CHKERRQ(ierr);
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr;
    if (fs->solveBuffer)   {cerr = cudaFree(fs->solveBuffer);CHKERRCUDA(cerr);}
    if (fs->csr2cscBuffer) {cerr = cudaFree(fs->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    delete fs;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
2607 
/* Overload for a mult structure: releases the stored matrix (HYB for pre-CUDA-11
   ELL/HYB formats, otherwise CSR), the cusparse descriptor, the compressed-row
   index vector, the device-side scalar constants, and (CUDA >= 11) the SpMV
   descriptors/buffers; then deletes the struct and nulls the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* bug fix: the error code was previously discarded; check it like every
           other CsrMatrix_Destroy() call site in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by the SpMV calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2651 
/* Empties a TriFactors container without freeing the container or its cusparse
   handle: destroys all four factor structures and the permutation/work vectors. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* delete and null each owned vector so the container can be safely reused */
    delete fs->rpermIndices;
    fs->rpermIndices = 0;
    delete fs->cpermIndices;
    fs->cpermIndices = 0;
    delete fs->workVector;
    fs->workVector = 0;
  }
  PetscFunctionReturn(0);
}
2671 
/* Fully destroys a TriFactors container: resets its contents, destroys the
   cusparse handle, and frees the struct (PetscFree also nulls *trifactors). */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2688