xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 1b0a6780d9463fdb93faab9d83a4082a63444bef)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
74 
75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
81 
82 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
83 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
84 
/*
  MatCUSPARSESetStream - Attach a user-provided CUDA stream to the cuSPARSE
  handle held in A->spptr, so subsequent cuSPARSE calls made through this
  matrix are issued on that stream.

  Errors with PETSC_ERR_COR if the matrix has no GPU-side data structure.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
96 
/*
  MatCUSPARSESetHandle - Replace the cuSPARSE handle used by A. A pre-existing,
  different handle is destroyed first; afterwards the pointer mode of the
  (possibly new) handle is set to device pointers.

  Errors with PETSC_ERR_COR if the matrix has no GPU-side data structure.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle before adopting the caller's */
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
113 
/*
  MatCUSPARSEClearHandle - Drop A's reference to its cuSPARSE handle WITHOUT
  destroying it (the handle is presumably owned by whoever installed it via
  MatCUSPARSESetHandle() -- verify against callers). A no-op for matrices that
  are not MATSEQAIJCUSPARSE or that have no GPU-side data structure.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  PetscBool          iscusp;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (!iscusp || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0; /* forget, do not destroy */
  PetscFunctionReturn(0);
}
126 
/* Report the solver-package name (MATSOLVERCUSPARSE) for factors produced here. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat mat,MatSolverType *stype)
{
  PetscFunctionBegin;
  *stype = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
133 
134 /*MC
135   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
136   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
139   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
140   algorithms are not recommended. This class does NOT support direct solver operations.
141 
142   Level: beginner
143 
144 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
145 M*/
146 
/*
  Create the (empty) factor matrix B for a cuSPARSE-based LU/ILU/ILUDT or
  Cholesky/ICC factorization of A. Only the symbolic-factorization function
  pointers are installed here; the symbolic routines in turn install the
  numeric ones.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n; /* factors are square, sized by A's row map */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE; /* NOTE(review): presumably tells callers the factor respects ordering ISs -- confirm */
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factor matrices get their storage during factorization, not here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
172 
/*
  Type-specific implementation behind MatCUSPARSESetFormat(). For sequential
  matrices both supported operations store into the single format field, so the
  two cases share one body.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: same storage field for both ops */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
190 
191 /*@
192    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
193    operation. Only the MatMult operation can use different GPU storage formats
194    for MPIAIJCUSPARSE matrices.
195    Not Collective
196 
197    Input Parameters:
198 +  A - Matrix of type SEQAIJCUSPARSE
199 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
200 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
201 
202    Output Parameter:
203 
204    Level: intermediate
205 
206 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
207 @*/
/* Public entry point: dispatch to the type-specific implementation, if any,
   composed under the name "MatCUSPARSESetFormat_C". */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod is a silent no-op on types that do not compose the method */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
217 
218 /*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
220 
221    Collective on mat
222 
223    Input Parameters:
224 +  A - Matrix of type SEQAIJCUSPARSE
225 -  transgen - the boolean flag
226 
227    Level: intermediate
228 
229 .seealso: MATSEQAIJCUSPARSE
230 @*/
/*
  Toggle explicit-transpose generation for a MATSEQAIJCUSPARSE matrix. Silently
  ignored for other matrix types; errors for factored matrices. Turning the
  flag off destroys any cached transpose so that a later re-enable rebuilds it
  from scratch.
*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) {
      /* drop a stale cached transpose to avoid logic errors if transgen is re-enabled later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
250 
/*
  Process runtime options for MATSEQAIJCUSPARSE: explicit-transpose generation,
  the GPU storage format(s), and -- when built against CUDA 11+ -- the cuSPARSE
  algorithm choices for SpMV, SpMM, and CSR-to-CSC conversion. Options are only
  consulted for non-factored matrices.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* -mat_cusparse_mult_storage_format sets the SpMV format only; the broader
       -mat_cusparse_storage_format below overrides it for SpMV and TriSolve */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency guard as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* MatCUSPARSECsr2CscAlgorithms[] has a dummy entry at 0, so ALG1 must be 1 */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
294 
/* Symbolic ILU: delegate to the host SeqAIJ symbolic phase, then route the
   numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(err);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
304 
/* Symbolic LU: delegate to the host SeqAIJ symbolic phase, then route the
   numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(err);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
314 
/* Symbolic ICC: delegate to the host SeqAIJ symbolic phase, then route the
   numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(err);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
324 
/* Symbolic Cholesky: delegate to the host SeqAIJ symbolic phase, then route
   the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(err);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
334 
/*
  Assemble, on the GPU, the unit-diagonal lower-triangular factor L from the
  host-side (Mat_SeqAIJ) ILU factor stored in A, then run the cuSPARSE
  triangular-solve analysis for it. Work is only done when the data is not yet
  on the GPU (offload mask UNALLOCATED or CPU). The CSR triple is first built
  in pinned host memory, copied into thrust device arrays, then freed.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;   /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AALo;          /* pinned host CSR values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* strictly-lower entries of rows 1..n-1 (ai[n]-ai[1]) plus n unit diagonal entries */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host memory for faster host->device transfer */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      /* row 0 holds only the implicit unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower part of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* upload the host CSR arrays into thrust device vectors */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      /* newer cuSPARSE needs an explicit, user-allocated work buffer for csrsv2 */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the device copies own the data now; release the pinned staging buffers */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
455 
/*
  Assemble, on the GPU, the upper-triangular factor U from the host-side
  (Mat_SeqAIJ) ILU factor stored in A, then run the cuSPARSE triangular-solve
  analysis for it. Mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix(), except
  that the upper part is read through a->diag (adiag) and the diagonal is
  non-unit.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AAUp;          /* pinned host CSR values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards for the factored SeqAIJ format, hence adiag[0]-adiag[n] */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned for fast H2D copy) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first and
         filling the CSR arrays back-to-front */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* NOTE(review): v[nz] holds the inverted diagonal of the factored AIJ
           format, so 1./v[nz] presumably recovers the true diagonal -- confirm
           against MatLUFactorNumeric_SeqAIJ */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* upload the host CSR arrays into thrust device vectors */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      /* newer cuSPARSE needs an explicit, user-allocated work buffer for csrsv2 */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                   upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                   upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                   upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                   &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the device copies own the data now; release the pinned staging buffers */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
572 
/*
  Rebuild the GPU triangular-factor structures for an ILU-factored matrix:
  reset any previous GPU state, build the lower and upper factors (with solve
  analysis), allocate the work vector used between the two solves, and upload
  the row/column permutations when the orderings are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;   /* orderings used by the factorization */
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* drop any previously built GPU factor data before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector holding the intermediate result between L- and U-solve */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  /* only upload the row permutation when it is not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices */
  /* only upload the column permutation when it is not the identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic of whichever permutations were uploaded */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
620 
/*
  Builds the device (cuSPARSE) triangular-factor structures for an IC(0)/Cholesky
  factorization from the host factor stored in A.  Only an upper-triangular CSR
  pattern is materialized on the GPU: the "upper" factor solves it directly
  (non-transpose, unit diagonal) while the "lower" factor reuses the SAME sparsity
  pattern with the transpose operation and separately scaled values (AALo), so no
  explicit lower-triangular matrix is ever stored.
  No-op when the matrix is empty or the device copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;  /* pinned host staging: row offsets / column indices of the upper factor */
  PetscScalar                       *AAUp;         /* upper-factor values: inverted diagonal, negated off-diagonals */
  PetscScalar                       *AALo;         /* values for the transposed "lower" solve: AAUp additionally scaled by 1/diag */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data; /* NOTE(review): A->data is read through both Mat_SeqAIJ and Mat_SeqSBAIJ; assumes the i/j/a members alias between the two structs -- confirm */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);  /* empty matrix: nothing to build */
  /* only (re)build when the host holds the newest data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned memory for fast H2D transfer) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; v[nz] is the diagonal entry,
           which presumably is stored last in each row -- stored INVERTED so the
           solve can multiply instead of divide */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: copied, then negated for the upper solve; the lower
             (transposed) solve additionally scales them by the inverted diagonal */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      /* unit diagonal: the true (inverted) diagonal lives in the stored values instead */
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      /* csrsv2-style API needs an explicit work buffer whose size is queried first */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      /* UPPER fill on purpose: the lower solve is done as U^T x = b (see solveOp below) */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern (AiUp/AjUp) as the upper factor, different values (AALo) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      /* log both factors' host-to-device traffic (indices + values) */
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* staging buffers are no longer needed once thrust owns device copies */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
800 
/*
  Rebuilds the IC(0) device factors for A and records the Cholesky ordering:
  resets any existing GPU factor data, builds the triangular factors on the
  device, and, when the ordering is not the identity, uploads both the
  permutation and its inverse for use during the triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscInt                     nrows    = A->rmap->n;
  const PetscInt               *permIdx;
  PetscBool                    isIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(nrows);
  /* strictly lower + strictly upper copies of the off-diagonal entries plus the diagonal */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  ierr = ISIdentity(perm,&isIdentity);CHKERRQ(ierr);
  ierr = ISGetIndices(perm,&permIdx);CHKERRQ(ierr);
  if (!isIdentity) {
    IS             invPerm;
    const PetscInt *invPermIdx;

    /* the triangular solves need the permutation on input and its inverse on output */
    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invPerm);CHKERRQ(ierr);
    ierr = ISGetIndices(invPerm,&invPermIdx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(permIdx, permIdx+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(invPermIdx, invPermIdx+nrows);
    ierr = ISRestoreIndices(invPerm,&invPermIdx);CHKERRQ(ierr);
    ierr = ISDestroy(&invPerm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(perm,&permIdx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
837 
/*
  Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself runs on
  the host, after which the appropriate GPU solve kernels are selected (the
  cheaper natural-ordering path when both row and column permutations are the
  identity) and the triangular factors are staged on the device.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata = (Mat_SeqAIJ*)B->data;
  IS             rperm = bdata->row,cperm = bdata->col;
  PetscBool      ridentity,cidentity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization; the device copy becomes stale */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(rperm,&ridentity);CHKERRQ(ierr);
  ierr = ISIdentity(cperm,&cidentity);CHKERRQ(ierr);
  natural = (PetscBool)(ridentity && cidentity);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
867 
/*
  Numeric Cholesky factorization for SEQAIJCUSPARSE: factor on the host, select
  the GPU solve kernels according to whether the fill-reducing ordering is the
  identity, then stage the factors on the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata = (Mat_SeqAIJ*)B->data;
  IS             ordering = bdata->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization; the device copy becomes stale */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ordering,&natural);CHKERRQ(ierr);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
896 
/*
  Builds explicit transposes (CSC forms) of both triangular factors so that
  MatSolveTranspose can run forward/backward substitution without per-solve
  transposition.  For each factor this creates a flipped matrix descriptor
  (fill mode swapped, non-transpose solve op), converts CSR to CSC on the
  device, and runs the cuSPARSE solve analysis on the result.  The new
  structures are stored in the *PtrTranspose slots of A->spptr.

  Fix vs. previous revision: loTriFactorT/upTriFactorT were initialized from
  cusparseTriFactors->{lo,up}TriFactorPtrTranspose and then immediately
  overwritten with `new`; the dead initializers suggested the existing
  transpose factors were reused when they never were.  They are now plain
  uninitialized declarations.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* freshly allocated below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* freshly allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode is flipped because transposing swaps upper and lower */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is explicit, so the solve is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 requires an explicit work buffer for csr2csc */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     the fill mode is flipped because transposing swaps upper and lower */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: explicit transpose, so non-transpose solve */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1088 
1089 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1090 {
1091   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1092   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1093   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1094   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1095   cusparseStatus_t             stat;
1096   cusparseIndexBase_t          indexBase;
1097   cudaError_t                  err;
1098   PetscErrorCode               ierr;
1099 
1100   PetscFunctionBegin;
1101   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
1102   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1103   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1104   /* create cusparse matrix */
1105   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1106   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1107   indexBase = cusparseGetMatIndexBase(matstruct->descr);
1108   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1109   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1110 
1111   /* set alpha and beta */
1112   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1113   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1114   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1115   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1116   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1117   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1118   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1119 
1120   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1121     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1122     CsrMatrix *matrixT= new CsrMatrix;
1123     matrixT->num_rows = A->cmap->n;
1124     matrixT->num_cols = A->rmap->n;
1125     matrixT->num_entries = a->nz;
1126     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1127     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1128     matrixT->values = new THRUSTARRAY(a->nz);
1129 
1130     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
1131     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1132 
1133     /* compute the transpose, i.e. the CSC */
1134    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1135     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1136                                   A->cmap->n, matrix->num_entries,
1137                                   matrix->values->data().get(),
1138                                   cusparsestruct->rowoffsets_gpu->data().get(),
1139                                   matrix->column_indices->data().get(),
1140                                   matrixT->values->data().get(),
1141                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1142                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1143                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1144     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1145    #endif
1146 
1147     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1148                             A->cmap->n, matrix->num_entries,
1149                             matrix->values->data().get(),
1150                             cusparsestruct->rowoffsets_gpu->data().get(),
1151                             matrix->column_indices->data().get(),
1152                             matrixT->values->data().get(),
1153                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1154                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1155                             CUSPARSE_ACTION_NUMERIC,indexBase,
1156                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1157                           #else
1158                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1159                             CUSPARSE_ACTION_NUMERIC, indexBase
1160                           #endif
1161                            );CHKERRCUSPARSE(stat);
1162     matstructT->mat = matrixT;
1163 
1164    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1165     stat = cusparseCreateCsr(&matstructT->matDescr,
1166                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1167                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1168                              matrixT->values->data().get(),
1169                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1170                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1171    #endif
1172   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1173    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1174     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1175    #else
1176     CsrMatrix *temp  = new CsrMatrix;
1177     CsrMatrix *tempT = new CsrMatrix;
1178     /* First convert HYB to CSR */
1179     temp->num_rows = A->rmap->n;
1180     temp->num_cols = A->cmap->n;
1181     temp->num_entries = a->nz;
1182     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1183     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1184     temp->values = new THRUSTARRAY(a->nz);
1185 
1186     stat = cusparse_hyb2csr(cusparsestruct->handle,
1187                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1188                             temp->values->data().get(),
1189                             temp->row_offsets->data().get(),
1190                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1191 
1192     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1193     tempT->num_rows = A->rmap->n;
1194     tempT->num_cols = A->cmap->n;
1195     tempT->num_entries = a->nz;
1196     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1197     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1198     tempT->values = new THRUSTARRAY(a->nz);
1199 
1200     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1201                             temp->num_cols, temp->num_entries,
1202                             temp->values->data().get(),
1203                             temp->row_offsets->data().get(),
1204                             temp->column_indices->data().get(),
1205                             tempT->values->data().get(),
1206                             tempT->column_indices->data().get(),
1207                             tempT->row_offsets->data().get(),
1208                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1209 
1210     /* Last, convert CSC to HYB */
1211     cusparseHybMat_t hybMat;
1212     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1213     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1214       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1215     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1216                             matstructT->descr, tempT->values->data().get(),
1217                             tempT->row_offsets->data().get(),
1218                             tempT->column_indices->data().get(),
1219                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1220 
1221     /* assign the pointer */
1222     matstructT->mat = hybMat;
1223     /* delete temporaries */
1224     if (tempT) {
1225       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1226       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1227       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1228       delete (CsrMatrix*) tempT;
1229     }
1230     if (temp) {
1231       if (temp->values) delete (THRUSTARRAY*) temp->values;
1232       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1233       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1234       delete (CsrMatrix*) temp;
1235     }
1236    #endif
1237   }
1238   err  = WaitForCUDA();CHKERRCUDA(err);
1239   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1240   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1241   /* the compressed row indices is not used for matTranspose */
1242   matstructT->cprowIndices = NULL;
1243   /* assign the pointer */
1244   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1245   PetscFunctionReturn(0);
1246 }
1247 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solves A^T x = b on the GPU using the cached transposed triangular factors.
   The transposed factors are built lazily on the first call. Because
   (L*U)^T = U^T * L^T, the upper factor is solved before the lower one --
   the reverse of the order used in MatSolve_SeqAIJCUSPARSE. The row/column
   permutations from the factorization are applied before/after the two
   triangular solves via thrust gather copies.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* device work vector reused across solves */
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only once */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, gathering permuted entries into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the (transposed) upper factor: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve with the (transposed) lower factor: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus n for the unit/diagonal entries */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1334 
/*
   Solves A^T x = b on the GPU when the factorization used the natural
   (identity) ordering, so no row/column permutation copies are needed:
   b feeds the upper-factor solve directly, and the lower-factor solve
   writes straight into x. Transposed factors are built lazily, as in
   MatSolveTranspose_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* device work vector between the two solves */
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only once */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the (transposed) upper factor: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve with the (transposed) lower factor: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus n for the unit/diagonal entries */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1402 
/*
   Solves A x = b on the GPU with the triangular factors stored in A->spptr:
   gather b through the row permutation, forward-solve L, back-solve U, then
   scatter through the column permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* device work vector reused across solves */
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into tempGPU. Note the range of a
     permutation iterator is defined by its index iterator, so the same base
     (bGPU) is used in both the begin and end markers. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder tempGPU with the column permutation into the result x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus n for the unit/diagonal entries */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1477 
/*
   Solves A x = b on the GPU when the factorization used the natural
   (identity) ordering, so the permutation gather/scatter steps of
   MatSolve_SeqAIJCUSPARSE are skipped: forward-solve L from b into the
   work vector, then back-solve U into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* device work vector between the two solves */
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: ~2*nnz flops, minus n for the unit/diagonal entries */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1537 
/*
   Copies the matrix numerical values (not the sparsity pattern) from the GPU
   back to the host array a->a, but only when the freshest copy lives solely on
   the GPU (PETSC_OFFLOAD_GPU). Afterwards both copies agree, so the offload
   mask becomes PETSC_OFFLOAD_BOTH. Only the CSR format is handled here --
   cusp->mat->mat is cast unconditionally to CsrMatrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* device-to-host copy of the a->nz stored values */
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1558 
/*
   Returns the host pointer to the matrix values, syncing them down from the
   GPU first if needed. Since the caller may modify the returned array, the
   offload mask is downgraded to PETSC_OFFLOAD_CPU so the next GPU operation
   re-uploads the values.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = a->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1570 
/*
   Mirrors the host AIJ matrix onto the GPU. Two paths:
   - If the matrix was assembled before with the same nonzero pattern
     (nonzerostate unchanged) and the format is CSR, only the numerical values
     are re-uploaded; an existing GPU transpose is refreshed in place with
     csr2csc.
   - Otherwise the GPU structures are destroyed and rebuilt from scratch
     (CSR, or ELL/HYB on CUDA < 11).
   No-op when the matrix is bound to the CPU or the GPU copy is already
   current. On exit the offload mask is PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Same nonzero pattern as the previous assembly: copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before. Note the csr2csc argument order
         differs between CUDA >= 11 (cusparseCsr2cscEx2-style, with algorithm
         and work buffer) and older releases. */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
                           );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Pattern changed (or first assembly): tear down and rebuild all GPU data */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* compressed-row storage keeps only the nonempty rows; ridx maps them
           back to global row numbers */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars (1, 0, 1) used with device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix: upload row offsets, column indices and values */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* Upload a temporary CSR copy, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* account for row offsets + column indices (int), cprow indices, and
           3 device scalars + values */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1742 
/* Thrust functor over a zipped tuple: accumulates the first component into
   the second, i.e. t<1> += t<0>. Callable on both host and device. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) += thrust::get<0>(zipped);
  }
};
1752 
/* Thrust functor: given a tuple t = (x, y) of scalar references, copy x into y (y = x) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1762 
/* Thrust functor: given a tuple t = (x, y) of scalar references, copy y into x (x = y);
   the reverse of VecCUDAEquals, used to scatter from the second range into the first */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1772 
/* Scratch data attached to C->product for SeqAIJCUSPARSE (sparse) times SeqDENSECUDA (dense) products */
struct MatMatCusparse {
  PetscBool            cisdense;      /* user's C was MATSEQDENSE (CPU); convert the GPU result back after the numeric phase */
  PetscScalar          *Bt;           /* buffer holding B^T; only used pre CUDA-11, where cusparseXcsrmm cannot transpose B */
  Mat                  X;             /* intermediate dense result X = A*op(B) for the PtAP/RARt product types */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;     /* cached dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;     /* cached dense descriptor for C (or X) */
  size_t               spmmBufferSize;
  void                 *spmmBuffer;   /* workspace queried via cusparseSpMM_bufferSize() */
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};
1786 
/* Destructor for the MatMatCusparse product data: releases the B^T buffer, the cached cuSPARSE
   dense descriptors and SpMM workspace (CUDA >= 11 only), and the intermediate matrix X */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op */
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
  if (mm->matBDescr)  {stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);}
  if (mm->matCDescr)  {stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);}
  if (mm->spmmBuffer) {cerr = cudaFree(mm->spmmBuffer);CHKERRCUDA(cerr);}
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1805 
1806 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1807 
/* Numeric phase of a SeqAIJCUSPARSE (sparse) times SeqDENSECUDA (dense) product.
   Computes C = op(A)*op(B) on the GPU via cusparseSpMM (CUDA >= 11) or cusparseXcsrmm (older CUDA);
   for PtAP/RARt the sparse product first goes into the intermediate X, which is then combined with
   B in a dense GEMM. Symbolic-phase data (MatMatCusparse) must be attached to C->product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the m x n dimensions of the product cusparse computes (C itself, or X for PtAP/RARt) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* use the explicitly assembled transpose instead of op = ^T */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the sparse product goes into the intermediate X; otherwise directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* for RARt/PtAP, finish with the dense product of B (R or P) against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* the user's C was MATSEQDENSE: convert the result back to the CPU type */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* likewise convert B back if we converted it above */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1977 
/* Symbolic phase of a SeqAIJCUSPARSE times SeqDENSECUDA product: sets the sizes and (GPU dense) type
   of C, allocates the MatMatCusparse product data -- including the B^T buffer (pre CUDA-11) and the
   intermediate matrix X (PtAP/RARt) -- and installs the numeric kernel on C */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the final product C for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2051 
2052 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2053 
/* handles dense B: installs the CUDA symbolic kernel for the supported product types,
   falling back to the CPU SeqAIJ*SeqDense path when A is bound to the CPU */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) { /* A lives on the host: use the CPU kernels */
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* all five dense-B product types share the same GPU symbolic phase; other types are left untouched */
  if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB  ||
      product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
      product->type == MATPRODUCT_RARt) {
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  }
  PetscFunctionReturn(0);
}
2079 
/* yy = A xx (no transpose, no added vector); thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE() */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2088 
/* zz = A xx + yy; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE() */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2097 
/* yy = A^H xx (conjugate transpose); thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE() */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2106 
/* zz = A^H xx + yy (conjugate transpose); thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE() */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2115 
/* yy = A^T xx; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE() (which may use a stored transpose) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2124 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* If yy is NULL, computes z = op(A) x. Handles the compressed-row (zero rows dropped) storage by
   scattering through workVector/cprowIndices; for the transpose it may use the stored transpose
   (matTranspose) instead of asking cusparse for op = ^T. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  /* no nonzero rows: op(A) x is zero, so z is just y (or the zero vector) */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else { /* use the explicitly stored transpose with op = no-op */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flops: 2*nz when the vector add is included, one fewer op per nonzero row otherwise */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2308 
/* zz = A^T xx + yy; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE() */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2317 
/* Finishes assembly. If a split-CSR device matrix exists, reads its nonzero state back from the GPU
   to decide whether assembly happened on the CPU (then copy to the GPU) or on the GPU (then mark the
   offload mask accordingly). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
  PetscBool                   is_seq = PETSC_TRUE;
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  if (d_mat) {
    cudaError_t err;
    ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
    /* bring the device-side header back to the host to inspect its state */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    nnz_state = h_mat.nonzerostate;
    is_seq = h_mat.seq;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  } else if (nnz_state > A->nonzerostate) { // device-side assembly advanced the state: GPU holds the fresh data
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
2345 
2346 /* --------------------------------------------------------------------------------*/
2347 /*@
2348    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2350    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2351    assembly performance the user should preallocate the matrix storage by setting
2352    the parameter nz (or the array nnz).  By setting these parameters accurately,
2353    performance during matrix assembly can be increased by more than a factor of 50.
2354 
2355    Collective
2356 
2357    Input Parameters:
2358 +  comm - MPI communicator, set to PETSC_COMM_SELF
2359 .  m - number of rows
2360 .  n - number of columns
2361 .  nz - number of nonzeros per row (same for all rows)
2362 -  nnz - array containing the number of nonzeros in the various rows
2363          (possibly different for each row) or NULL
2364 
2365    Output Parameter:
2366 .  A - the matrix
2367 
2368    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2370    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2371 
2372    Notes:
2373    If nnz is given then nz is ignored
2374 
2375    The AIJ format (also called the Yale sparse matrix format or
2376    compressed row storage), is fully compatible with standard Fortran 77
2377    storage.  That is, the stored row and column indices can begin at
2378    either one (as in Fortran) or zero.  See the users' manual for details.
2379 
2380    Specify the preallocated storage with either nz or nnz (not both).
2381    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2382    allocation.  For large problems you MUST preallocate memory or you
2383    will get TERRIBLE performance, see the users' manual chapter on matrices.
2384 
2385    By default, this format uses inodes (identical nodes) when possible, to
2386    improve numerical efficiency of matrix-vector products and solves. We
2387    search for consecutive rows with the same nonzero structure, thereby
2388    reusing matrix information to achieve increased efficiency.
2389 
2390    Level: intermediate
2391 
2392 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2393 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create an empty Mat, fix its sizes and type, then preallocate with either nz or nnz[] (see manpage above) */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2405 
/* Destroy a SEQAIJCUSPARSE matrix: free the GPU-side data (plain CSR struct or
   triangular factors), free the optional device split-CSR descriptor, remove the
   composed type-specific methods, and fall through to the base AIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* Detach the device-side descriptor first so it survives MatSeqAIJCUSPARSE_Destroy and can be freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    /* Factored matrices carry triangular-factor data in spptr instead of a CSR struct */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* Copy the descriptor back to the host so the device pointers it owns can be inspected and freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (h_mat.seq) {
      if (a->compressedrow.use) {
        /* diag.i was allocated separately when compressed-row storage is in use */
 	err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
      }
      err = cudaFree(d_mat);CHKERRCUDA(err);
    }
  }
  /* Remove the composed methods installed by MatConvert_SeqAIJ_SeqAIJCUSPARSE / MatBindToCPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2441 
2442 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2443 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a plain SeqAIJ copy on the host and then converting it
   in place to SEQAIJCUSPARSE (which allocates fresh GPU data for the copy). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2453 
/* Switch the matrix operation table between the host (SeqAIJ) kernels and the
   GPU (CUSPARSE) kernels.  flg == PETSC_TRUE binds to the CPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Factored matrices use their own solve paths; binding is a no-op for them */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* Bring host values up to date before handing control to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    /* no dedicated host Hermitian-transpose kernels are installed here */
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    /* Remove the GPU-only composed methods while bound to the CPU */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    /* Install the CUSPARSE kernels and the COO assembly / GPU array hooks */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* the inode optimization is only exploited by the CPU kernels */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
2492 
/* Zero all stored values: the GPU CSR copy (and its transpose, if built), the
   optional device split-CSR values, and finally the host values via the base
   SeqAIJ implementation. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;  /* set when the GPU copy was zeroed too */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      /* keep the cached transpose consistent with the zeroed matrix */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
    PetscInt     n = A->rmap->n, nnz = a->i[n];
    cudaError_t  err;
    PetscScalar  *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    /* fetch the device values pointer from the device-resident descriptor, then memset it */
    err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  /* zero the host copy as well */
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* host and device now agree, so the offload mask can be BOTH */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}
2531 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE: (re)use or duplicate the matrix
   per `reuse`, switch the default vector type to CUDA, allocate the CUSPARSE
   side data on first conversion, install the type-specific operation table,
   and bind to the GPU. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX falls through: *newmat is A itself */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: allocate the CSR-side struct and a cusparse handle */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: allocate the triangular-factors struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  /* new matrices start out bound to the GPU */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2579 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix,
   convert it in place, then process type-specific command-line options. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the CUDA runtime has been initialized before any device work */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2593 
2594 /*MC
2595    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2596 
2597    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2598    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2599    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2600 
2601    Options Database Keys:
2602 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2603 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2604 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2605 
2606   Level: beginner
2607 
2608 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2609 M*/
2610 
2611 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2612 
2613 
/* Register the CUSPARSE solver package as a factorization provider for
   MATSEQAIJCUSPARSE matrices (LU, Cholesky, ILU and ICC factorizations). */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factortypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              k;

  PetscFunctionBegin;
  /* all four factorization kinds are served by the same factory routine */
  for (k=0; k<sizeof(factortypes)/sizeof(factortypes[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factortypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2625 
/* Free the Mat_SeqAIJCUSPARSE struct: both mult structs (matrix and its cached
   transpose), the thrust work arrays used by the COO assembly path, the
   cusparse handle, and (CUDA >= 11) the csr2csc scratch buffer. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting NULL is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->cooPerm_v;
    delete (*cusparsestruct)->cooPerm_w;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is a documented no-op, so no guard is needed here either */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2649 
/* Release a CsrMatrix: delete the three device arrays it owns, then the
   struct itself, and leave *mat NULL so the caller cannot reuse it. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    /* deleting a NULL member is harmless, so no individual guards are needed */
    delete (*mat)->row_offsets;
    delete (*mat)->column_indices;
    delete (*mat)->values;
    delete *mat;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
2662 
/* Free one triangular-factor struct: its matrix descriptor, the analysis
   info used by the triangular solves, the CSR data, and the scratch buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* the csr2csc scratch buffer only exists with the CUDA-11 generic API */
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    delete *trifactor;
    *trifactor = NULL;
  }
  PetscFunctionReturn(0);
}
2682 
/* Free one mult struct: the stored matrix (CSR, or HYB/ELL pre CUDA-11), the
   cusparse matrix descriptor, the compressed-row index vector, the device
   scalar constants, and (CUDA >= 11) the SpMV descriptors and buffers.
   Fix: the CsrMatrix_Destroy() return code was previously ignored; it is now
   checked with CHKERRQ like every other call site in this file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* was ignoring the returned error code */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used as alpha/beta in SpMV calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation variant */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
2726 
/* Reset (but do not free) a TriFactors struct: destroy the four factor
   structs and release the permutation/work vectors, leaving the struct
   itself and its cusparse handle intact for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* pair each delete with its NULL-reset so the struct is safe to reuse */
    delete (*trifactors)->rpermIndices;  (*trifactors)->rpermIndices = NULL;
    delete (*trifactors)->cpermIndices;  (*trifactors)->cpermIndices = NULL;
    delete (*trifactors)->workVector;    (*trifactors)->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}
2746 
/* Fully destroy a TriFactors struct: reset its contents, destroy the
   cusparse handle, and free the struct itself.
   Fix: the handle assignment was embedded in the if-condition
   (`if (handle = ...)`), a warning-prone idiom easily misread as `==`;
   the assignment is now hoisted out of the condition. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2763 
/* Strict-weak ordering on (i,j) index pairs: lexicographic, row first. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;      /* rows differ: order by row */
    return t1.get<1>() < t2.get<1>();  /* same row: order by column */
  }
};
2774 
/* Equality of (i,j) index pairs: both components must match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return (t1.get<0>() == t2.get<0>()) && (t1.get<1>() == t2.get<1>());
  }
};
2784 
/* Binary op for adjacent_difference: 0 when the two values are equal, 1 otherwise
   (marks positions where a sorted index sequence changes). */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
2793 
/* Binary op combining two change-flags: logical OR, yielding 0 or 1. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
2802 
2803 #include <thrust/iterator/discard_iterator.h>
/* Set (or add) matrix values from a COO-format array v[], using the
   permutation computed by MatSetPreallocationCOO_SeqAIJCUSPARSE:
   - cooPerm maps sorted CSR slots back to positions in v[];
   - cooPerm_a, when present, marks duplicate (i,j) entries that must be
     combined into a single CSR slot.
   v may be NULL, in which case zeros are used. */
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done; just flush assembly and return */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* device staging array for the user-provided values */
  if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (v) {
    cusp->cooPerm_v->assign(v,v+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) {
      /* duplicates present: sum values belonging to the same CSR slot first, then add to the matrix */
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      /* all entries unique: permute v into CSR order and += in one pass */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
                              if we are inserting two different values into the same location */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
      /* all entries unique: straight permuted assignment */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2868 
2869 #include <thrust/binary_search.h>
/* Preallocate from COO (i,j) index lists on the GPU:
   sort the pairs, remove duplicates, build the CSR row offsets and column
   indices on the host, and record in cooPerm (and cooPerm_a, when there are
   duplicates) the mapping needed later by MatSetValuesCOO_SeqAIJCUSPARSE. */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;  /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* discard stale COO state if the number of entries changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);           /* device copies of the user index arrays */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);   /* per-row upper bounds -> CSR row offsets */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    /* sort (i,j) pairs lexicographically, carrying the original positions in cooPerm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep sorted copies of i and j for the duplicate analysis below */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* mark where (i,j) changes, then prefix-sum to get, for every COO entry,
         the index of the unique CSR slot it belongs to */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row of the deduplicated list: upper_bound over the sorted rows */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    /* empty matrix: fall back to a zero-size standard preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy garbage to the GPU */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    /* zero the freshly uploaded values and drop any stale cached transpose */
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
2974