xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 7e8381f984f038c2461ffb95eefd9a80face03cb)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* String tables consumed by PetscOptionsEnum(): the enum value names appear first in
   0-based value order, followed by the enum type name, the common prefix to strip from
   user input, and a NULL sentinel.  The positions must match the corresponding C enums. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Note SpMM values 4 and 5 are swapped relative to name order (CSR_ALG1=4, COO_ALG4=5),
     hence "CSR_ALG1" precedes "COO_ALG4" below so array position matches the enum value. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
69 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
74 
75 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
81 
82 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
83 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
84 
/*
   MatCUSPARSESetStream - records a user-provided CUDA stream in the matrix and
   attaches it to the matrix's cuSPARSE handle, so subsequent cuSPARSE calls made
   on behalf of this matrix are queued on that stream.

   Errors with PETSC_ERR_COR if the matrix has no GPU data structure (spptr).
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
96 
/*
   MatCUSPARSESetHandle - adopts a caller-supplied cuSPARSE handle for this matrix.
   Any previously owned handle is destroyed first; the pointer mode of the (new)
   handle is then set so that scalar arguments are read from device memory.

   Errors with PETSC_ERR_COR if the matrix has no GPU data structure (spptr).
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we currently own before taking over the new one */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
113 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle stored
   in a SEQAIJCUSPARSE matrix.  A no-op for other matrix types or when the GPU data
   structure does not exist; the handle itself is assumed to be owned elsewhere.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  PetscBool          iscusp;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
126 
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the
   solver package that produced the factorization.  A is unused. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
133 
134 /*MC
135   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
136   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
137   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
138   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
139   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
140   algorithms are not recommended. This class does NOT support direct solver operations.
141 
142   Level: beginner
143 
144 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
145 M*/
146 
/* Factory routine registered for MatSolverType "cusparse": creates the empty factor
   matrix B and installs the symbolic-factorization function pointers appropriate for
   the requested factor type.  Supported: LU/ILU/ILUDT and Cholesky/ICC; anything
   else raises PETSC_ERR_SUP.  The numeric-stage pointers are installed later by the
   symbolic routines themselves. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;  /* factor matrices are square, sized by A's row map */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* factortype must be set before MatSetType() so type construction sees a factor matrix */
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the symbolic factorization allocates the nonzero structure */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
172 
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format.  A sequential matrix keeps a single format, so MAT_CUSPARSE_MULT
   and MAT_CUSPARSE_ALL are handled identically; other operations are rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
190 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for matrix types that do not compose the method */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
217 
/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* silently ignore matrices that are not SEQAIJCUSPARSE */
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    /* when turning the option off, drop any cached transpose so that re-enabling it
       later cannot pick up a stale copy */
    if (!transgen) {ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
250 
/* Processes the -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix.
   Options are consulted only for non-factored matrices; factor matrices keep defaults.
   On CUDA >= 11 this also selects the cuSPARSE SpMV/SpMM/csr2csc algorithms and
   cross-checks that the hard-coded cuSPARSE enum values still match PETSc's tables. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    /* whether to build an explicit transpose for MatMultTranspose */
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* storage format for MatMult only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* storage format for all operations (overrides the previous option when given) */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* NOTE(review): the same struct field is used as both default and result below via a
       (PetscEnum*) cast; this assumes the cuSPARSE enum and PetscEnum share a representation */
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* csr2csc algorithm used when building the explicit transpose */
    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
294 
/* Symbolic ILU: defer to the host AIJ implementation, then point the numeric stage
   at the CUSPARSE version so the resulting factors are copied/analyzed on the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
304 
/* Symbolic LU: defer to the host AIJ implementation, then install the CUSPARSE
   numeric-factorization routine (same numeric kernel as ILU). */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
314 
/* Symbolic ICC: defer to the host AIJ implementation, then install the CUSPARSE
   Cholesky numeric-factorization routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
324 
/* Symbolic Cholesky: defer to the host AIJ implementation, then install the CUSPARSE
   numeric-factorization routine (same numeric kernel as ICC). */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
334 
/* Extracts the unit-diagonal lower-triangular factor L from the host AIJ factor data,
   uploads it as a CSR matrix, and runs the cuSPARSE triangular-solve analysis.

   The host factor (Mat_SeqAIJ in A->data) stores L and U packed together per row, with
   L's unit diagonal implicit; here L is rebuilt row by row into pinned host buffers
   AiLo/AjLo/AALo with an explicit 1.0 diagonal entry appended to each row, then copied
   to the device via thrust assign().  Only runs when the up-to-date copy is on the CPU
   (offloadmask UNALLOCATED or CPU).  Exceptions thrown by thrust/new are converted to
   a PETSc error. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): this initial value is never used -- it is overwritten by 'new' below;
     assumes any previous factor was released by MatSeqAIJCUSPARSETriFactors_Reset() */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* rows 1..n-1 contribute ai[n]-ai[1] strictly-lower entries; the +n adds one diagonal per row */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix; pinned host memory for fast H2D copies */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix; row 0 holds only the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];  /* strictly-lower entries of row i in the host factor */
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower part, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11's csrsv2 interface expects a GENERAL matrix type */
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device-side thrust vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 requires an explicit user-allocated work buffer for the solve */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the pinned staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
455 
/* Extracts the upper-triangular factor U from the host AIJ factor data, uploads it as
   a CSR matrix, and runs the cuSPARSE triangular-solve analysis.

   The host factor stores 1/diag at a->diag and the strictly-upper entries after it; U
   is rebuilt back-to-front into pinned buffers AiUp/AjUp/AAUp with the diagonal entry
   stored first in each row as the reciprocal of the pivot (so the diag type below is
   NON_UNIT).  Only runs when the up-to-date copy is on the CPU.  Mirrors
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): initial value unused -- overwritten by 'new' below; assumes any old
     factor was released by a prior MatSeqAIJCUSPARSETriFactors_Reset() */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards through the U part, so adiag[0]-adiag[n] is U's total nnz */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned for fast H2D copies) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first since
         adiag[] indexes U back-to-front */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements; the host factor stores 1/pivot at v[nz] */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11's csrsv2 interface expects a GENERAL matrix type */
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the pinned host CSR arrays into device-side thrust vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 requires an explicit user-allocated work buffer for the solve */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                   upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                   upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                   upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                   &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the pinned staging buffers are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
572 
/* Rebuilds both triangular ILU factors on the GPU and uploads the row/column
   permutations needed by MatSolve.

   Side effects: resets any previous GPU factor data, allocates a length-n work
   vector used between the two triangular solves, records the factor nnz, and marks
   the matrix offloaded on both CPU and GPU.  Permutations are uploaded only when
   they are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* discard stale GPU factors left over from a previous numeric factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only if it is not identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: likewise for the column permutation */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log the host-to-device traffic from the permutation uploads above */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
620 
/*
   Build the device (cuSPARSE) triangular-factor structures for an
   IC(0)/Cholesky factorization.

   The host factor is viewed through the SBAIJ layout (b->i/b->j/b->a below):
   each row stores its strictly-upper entries followed by the diagonal entry,
   so v[nz] (with nz = row length minus one) is the diagonal d_i.  Two CSR
   value arrays sharing one sparsity pattern (AiUp/AjUp) are assembled in
   pinned host memory and uploaded:
     - AAUp: 1/d_i on the diagonal, -u_ij off the diagonal; solved as a
       unit-diagonal UPPER factor with the NON_TRANSPOSE operation;
     - AALo: 1/d_i on the diagonal, -u_ij/d_i off the diagonal; solved with
       the TRANSPOSE operation, so it plays the role of the lower factor.
   Work happens only when the data is current on the CPU (or not yet on the
   GPU); on success the factor is marked valid on both host and device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data; /* same data reinterpreted through the SBAIJ (upper-triangle) layout */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix; pinned host memory speeds the uploads below */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; v[nz] is the diagonal entry of row i */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* then the strictly-upper entries: negated for AAUp, additionally scaled by 1/d_i for AALo */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 requires a user-provided work buffer for the triangular-solve analysis */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      /* the lower factor reuses the UPPER sparsity pattern (AiUp/AjUp); the TRANSPOSE
         solve operation below makes it act as the lower triangular factor */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      /* log the host-to-device traffic of both factors (indices + values) */
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* the pinned staging buffers are no longer needed once the thrust arrays hold copies */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
800 
/*
   Rebuild the GPU-side data needed to apply an IC(0)/Cholesky factor:
   resets any previous device factor structures, regenerates the triangular
   factors on the device, and, for a non-identity ordering, uploads the row
   permutation together with its inverse for use in the triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm = aij->row;
  const PetscInt               *permidx;
  PetscBool                    identity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  /* throw away any stale device factors before rebuilding from scratch */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY(m);
  factors->nnz        = (aij->nz-m)*2 + m; /* strict upper and lower parts plus the diagonal */

  /* lower triangular indices */
  ierr = ISGetIndices(perm,&permidx);CHKERRQ(ierr);
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *invidx;

    /* the solves need both the permutation (rows) and its inverse (columns) */
    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&invidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(permidx, permidx+m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(invidx, invidx+m);
    ierr = ISRestoreIndices(invperm,&invidx);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(perm,&permidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
837 
/*
   Numeric LU factorization for SeqAIJCUSPARSE: performs the factorization
   on the host via the SeqAIJ kernel, selects the solve routines (the
   natural-ordering fast path applies only when both the row and the column
   permutations are identities), then builds/copies the triangular factors
   onto the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)B->data;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the freshly computed factor lives on the host */
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(aij->row,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(aij->col,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no specialized MatMatSolve in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
867 
/*
   Numeric Cholesky factorization for SeqAIJCUSPARSE: factors on the host
   via the SeqAIJ kernel, picks the solve routines (the natural-ordering
   fast path applies only when the permutation is the identity), then
   builds/copies the triangular factors onto the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)B->data;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the freshly computed factor lives on the host */
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(aij->row,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no specialized MatMatSolve in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
896 
/*
   Build explicit transposes (via CSR-to-CSC conversion) of both triangular
   factors so that transpose solves can later be carried out with the
   NON_TRANSPOSE operation.  For each factor the descriptor settings are
   copied with the fill mode flipped (transposition swaps upper and lower),
   the CSC structure is computed on the device, and the cuSPARSE solve
   analysis is run on the result.  On CUDA >= 11 explicit user buffers must
   be allocated for both the conversion and the analysis.
   Note: the loTriFactorT/upTriFactorT values read from cusparseTriFactors
   at entry are immediately replaced by newly allocated structs; the new
   pointers are stored back into A->spptr at the end of each half.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode is flipped because transposition swaps the triangles */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation; the transpose is explicit, so solves use NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: query and allocate the conversion work buffer first */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 also needs an explicit buffer for the triangular-solve analysis */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     the fill mode is flipped because transposition swaps the triangles */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation; the transpose is explicit, so solves use NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);

  /* assign the pointer. Is this really necessary? */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1087 
1088 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1089 {
1090   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1091   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1092   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1093   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1094   cusparseStatus_t             stat;
1095   cusparseIndexBase_t          indexBase;
1096   cudaError_t                  err;
1097   PetscErrorCode               ierr;
1098 
1099   PetscFunctionBegin;
1100   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
1101   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1102   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1103   /* create cusparse matrix */
1104   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1105   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1106   indexBase = cusparseGetMatIndexBase(matstruct->descr);
1107   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
1108   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1109 
1110   /* set alpha and beta */
1111   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1112   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1113   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1114   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1115   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1116   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1117   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1118 
1119   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1120     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1121     CsrMatrix *matrixT= new CsrMatrix;
1122     matrixT->num_rows = A->cmap->n;
1123     matrixT->num_cols = A->rmap->n;
1124     matrixT->num_entries = a->nz;
1125     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1126     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1127     matrixT->values = new THRUSTARRAY(a->nz);
1128 
1129     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
1130     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1131 
1132     /* compute the transpose, i.e. the CSC */
1133    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1134     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1135                                   A->cmap->n, matrix->num_entries,
1136                                   matrix->values->data().get(),
1137                                   cusparsestruct->rowoffsets_gpu->data().get(),
1138                                   matrix->column_indices->data().get(),
1139                                   matrixT->values->data().get(),
1140                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1141                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1142                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1143     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1144    #endif
1145 
1146     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1147                             A->cmap->n, matrix->num_entries,
1148                             matrix->values->data().get(),
1149                             cusparsestruct->rowoffsets_gpu->data().get(),
1150                             matrix->column_indices->data().get(),
1151                             matrixT->values->data().get(),
1152                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1153                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1154                             CUSPARSE_ACTION_NUMERIC,indexBase,
1155                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1156                           #else
1157                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1158                             CUSPARSE_ACTION_NUMERIC, indexBase
1159                           #endif
1160                            );CHKERRCUSPARSE(stat);
1161     matstructT->mat = matrixT;
1162 
1163    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1164     stat = cusparseCreateCsr(&matstructT->matDescr,
1165                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1166                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1167                              matrixT->values->data().get(),
1168                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1169                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1170    #endif
1171   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1172    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1173     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1174    #else
1175     CsrMatrix *temp  = new CsrMatrix;
1176     CsrMatrix *tempT = new CsrMatrix;
1177     /* First convert HYB to CSR */
1178     temp->num_rows = A->rmap->n;
1179     temp->num_cols = A->cmap->n;
1180     temp->num_entries = a->nz;
1181     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1182     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1183     temp->values = new THRUSTARRAY(a->nz);
1184 
1185     stat = cusparse_hyb2csr(cusparsestruct->handle,
1186                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1187                             temp->values->data().get(),
1188                             temp->row_offsets->data().get(),
1189                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1190 
1191     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1192     tempT->num_rows = A->rmap->n;
1193     tempT->num_cols = A->cmap->n;
1194     tempT->num_entries = a->nz;
1195     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1196     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1197     tempT->values = new THRUSTARRAY(a->nz);
1198 
1199     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1200                             temp->num_cols, temp->num_entries,
1201                             temp->values->data().get(),
1202                             temp->row_offsets->data().get(),
1203                             temp->column_indices->data().get(),
1204                             tempT->values->data().get(),
1205                             tempT->column_indices->data().get(),
1206                             tempT->row_offsets->data().get(),
1207                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1208 
1209     /* Last, convert CSC to HYB */
1210     cusparseHybMat_t hybMat;
1211     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1212     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1213       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1214     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1215                             matstructT->descr, tempT->values->data().get(),
1216                             tempT->row_offsets->data().get(),
1217                             tempT->column_indices->data().get(),
1218                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1219 
1220     /* assign the pointer */
1221     matstructT->mat = hybMat;
1222     /* delete temporaries */
1223     if (tempT) {
1224       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1225       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1226       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1227       delete (CsrMatrix*) tempT;
1228     }
1229     if (temp) {
1230       if (temp->values) delete (THRUSTARRAY*) temp->values;
1231       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1232       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1233       delete (CsrMatrix*) temp;
1234     }
1235    #endif
1236   }
1237   err  = WaitForCUDA();CHKERRCUDA(err);
1238   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1239   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1240   /* the compressed row indices is not used for matTranspose */
1241   matstructT->cprowIndices = NULL;
1242   /* assign the pointer */
1243   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1244   PetscFunctionReturn(0);
1245 }
1246 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solves A^T x = b using separately analyzed transposed triangular factors
   (built lazily on first call).  Since A = LU implies A^T = U^T L^T, the upper
   factor is applied before the lower one -- the reverse of MatSolve_SeqAIJCUSPARSE.
   This variant also applies the factorization's row permutation (rpermIndices)
   on input and column permutation (cpermIndices) on output.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* scratch vector shared by the solve routines */
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only on the first call */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, gathering into x (x serves as a temporary here) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with U^T (upper factor first for the transposed system): x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve with L^T: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, gather the solution with the column permutation into a temporary ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary back into the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1333 
/*
   Solves A^T x = b for a factorization done in natural ordering: no row/column
   permutations are applied, so the result flows b -> U^T solve -> tempGPU ->
   L^T solve -> x directly.  The transposed factors are built lazily on first call.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* scratch vector shared by the solve routines */
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only on the first call */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with U^T (upper factor first for the transposed system): b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve with L^T: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1401 
/*
   Solves A x = b with the cached lower/upper triangular factors (A = LU):
   the row permutation (rpermIndices) is applied to b first, then L and U are
   solved in sequence, and finally the column permutation (cpermIndices)
   scatters the result into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* scratch vector shared by the solve routines */
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, gathering into the scratch vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder the scratch vector with the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1476 
/*
   Solves A x = b for a factorization done in natural ordering: no permutations
   are needed, so the data flows b -> L solve -> tempGPU -> U solve -> x directly.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* scratch vector shared by the solve routines */
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1536 
/*
   Mirrors the matrix nonzero values from the device back to the host when the
   freshest copy lives only on the GPU.  Afterwards both copies agree, so the
   offload mask is set to PETSC_OFFLOAD_BOTH.  A no-op for every other state.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) { /* only the device copy is current */
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* pull the a->nz scalar values straight into the host AIJ storage */
    cerr = cudaMemcpy(aij->a,csr->values->data().get(),aij->nz*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now hold identical values */
  }
  PetscFunctionReturn(0);
}
1557 
/*
   Returns a host pointer to the matrix values.  The device copy is first
   synchronized to the host; since the caller may write through the returned
   pointer, the device copy is then marked stale (PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* ensure the host values are current before exposing them */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  /* the caller may modify the array in place, so invalidate the GPU copy */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1569 
/*
   Mirrors the host AIJ matrix onto the device when the host copy is newer.
   Two paths:
     - fast path: the nonzero pattern is unchanged (same nonzerostate) and the
       format is CSR, so only the values array is refreshed (and the cached
       transpose, if one was built, is regenerated numerically);
     - slow path: the device structures are destroyed and rebuilt from scratch
       from the host CSR (or converted to ELL/HYB on CUDA < 11).
   No-op when the matrix is bound to the CPU or the device copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Fast path: same sparsity pattern as last upload -- copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before: re-run the numeric csr2csc so the
         cached transpose stays consistent with the new values */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
                           );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Slow path: pattern changed (or first upload) -- rebuild everything */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* compressed-row storage keeps only the nonempty rows; ridx maps them back */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* upload the 1/0 scalar constants: pointer mode is set to DEVICE below,
           so alpha/beta arguments must live in device memory */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix: copy row offsets, column indices and values host -> device */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* CUDA < 11: stage a temporary device CSR, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the staged CSR is no longer needed once the HYB matrix is built */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* log everything sent to the device: CSR arrays, cprowIndices, and the 3 scalar constants */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember the pattern we uploaded so the fast path can be taken next time */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1741 
/* Thrust functor for zipped iterators: accumulates element 0 into element 1 (y += x). */
struct VecCUDAPlusEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1751 
/* Thrust functor for zip iterators: copy tuple element 0 onto tuple element 1 */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1761 
/* Thrust functor for zip iterators: copy tuple element 1 onto tuple element 0
   (the reverse direction of VecCUDAEquals); used below to gather x into the
   compressed-row work vector */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1771 
/* Scratch data attached to a Mat_Product for C = op(A) op(B) products where A is
   SeqAIJCUSPARSE and B is dense; freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool            cisdense;      /* was C of type MATSEQDENSE (CPU)? if so, the numeric phase converts C back to MATSEQDENSE at the end */
  PetscScalar          *Bt;           /* device buffer holding B^T; only allocated pre CUDA-11, where cusparseXcsrmm cannot transpose B */
  Mat                  X;             /* intermediate dense GPU matrix (A*R^T or A*P) used by the RARt/PtAP products */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;     /* cached dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;     /* cached dense descriptor for C (or X, for RARt/PtAP) */
  size_t               spmmBufferSize;
  void                 *spmmBuffer;   /* device workspace for cusparseSpMM */
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};
1785 
/* Destroy callback for the MatMatCusparse product data: releases the device
   buffers, the cached cuSPARSE dense descriptors (CUDA >= 11 only), the
   intermediate matrix X, and the struct itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  /* Bt may be NULL (only allocated for ABt/RARt pre CUDA-11); cudaFree(NULL) is a no-op */
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
  if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); /* X may be NULL; MatDestroy handles that */
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1804 
1805 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1806 
/* Numeric phase of C = op(A) op(B) with A SeqAIJCUSPARSE and B dense.
   AB/AtB/ABt are computed directly with cusparseSpMM (CUDA >= 11) or
   cusparseXcsrmm (older CUDA); PtAP/RARt first compute X = A*P (resp. A*R^T)
   here and then finish with a dense-dense product B^T*X (resp. B*X). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select which CSR struct (A or its explicit transpose) and op(A) to use,
     and the dimensions m x n of the sparse product's result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP computes X = A*P here; the Pt* part is done at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* an explicit transpose was requested: use it with a non-transpose op */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt computes X = A*R^T here; the R* part is done at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the sparse product goes into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }
    /* NOTE(review): if we enter this branch only because Clda changed, matBDescr is
       kept and its stored value pointer is NOT refreshed to the current barray;
       this relies on B's device array address being stable across calls — verify */

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) { /* sparse descriptor for A is built once and cached on the mult struct */
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B, so explicitly transpose B into Bt with cuBLAS geam */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense product on the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* user's C was a CPU dense matrix: convert the result back */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* undo the temporary conversion of B done above */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1976 
/* Symbolic phase of C = op(A) op(B) with A SeqAIJCUSPARSE and B dense: sets C's
   sizes and type, and allocates the MatMatCusparse scratch data (Bt buffer on
   old CUDA, intermediate X for PtAP/RARt) used by the numeric phase */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2050 
2051 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2052 
/* handles dense B: install the GPU symbolic routine for the supported product
   types; if A is bound to the CPU, fall back to the host AIJ*Dense path */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* all five of these product types are handled by the same symbolic routine;
     any other type leaves C->ops->productsymbolic untouched */
  if (product->type == MATPRODUCT_AB  || product->type == MATPRODUCT_AtB  ||
      product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_PtAP ||
      product->type == MATPRODUCT_RARt) {
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
  }
  PetscFunctionReturn(0);
}
2078 
/* yy = A xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no add vector (NULL), no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2087 
/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* add yy, no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2096 
/* yy = A^H xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no add vector (NULL); trans=TRUE, herm=TRUE selects the conjugate transpose */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2105 
/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* add yy; trans=TRUE, herm=TRUE selects the conjugate transpose */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2114 
/* yy = A^T xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no add vector (NULL); trans=TRUE, herm=FALSE selects the plain transpose */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2123 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   y may be NULL (pure multiply) or alias z (in-place add). When the matrix stores
   compressed rows (zero rows dropped), a per-matrix work vector holds the short
   product/operand and thrust scatter/gather kernels map it to/from full length. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x, taken from the CSR struct below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* empty matrix: op(A) x is zero, so z = y (or 0 if no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* no explicit transpose available (or Hermitian requested): let cuSPARSE apply op(A) */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (building on demand) the explicitly stored transpose with a non-transpose op */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used below to index the per-operation descriptor cache cuSpMV[3] */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else { /* ELL/HYB storage; only available before CUDA-11 */
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose case: the SpMV above wrote the full-length result with beta already applied,
         except when yy is a distinct vector that could not be folded in */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2306 
/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* add yy; trans=TRUE, herm=FALSE selects the plain transpose */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2315 
/* Finish assembly: if a split-CSR device matrix exists, read back its nonzero
   state to decide whether the authoritative data is on the CPU (then copy to
   the GPU) or already on the GPU (then just mark the offload mask) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL, h_mat; /* d_mat: device copy; h_mat: host staging for its contents */
  PetscBool                   is_seq = PETSC_TRUE;
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  if (d_mat) {
    cudaError_t err;
    ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
    /* pull the device-side header back to the host to inspect its state */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    nnz_state = h_mat.nonzerostate;
    is_seq = h_mat.seq;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU even though equipped for GPU
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  } else if (nnz_state > A->nonzerostate) { // device assembly advanced further: GPU holds the up-to-date values
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
2343 
2344 /* --------------------------------------------------------------------------------*/
2345 /*@
2346    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2348    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2349    assembly performance the user should preallocate the matrix storage by setting
2350    the parameter nz (or the array nnz).  By setting these parameters accurately,
2351    performance during matrix assembly can be increased by more than a factor of 50.
2352 
2353    Collective
2354 
2355    Input Parameters:
2356 +  comm - MPI communicator, set to PETSC_COMM_SELF
2357 .  m - number of rows
2358 .  n - number of columns
2359 .  nz - number of nonzeros per row (same for all rows)
2360 -  nnz - array containing the number of nonzeros in the various rows
2361          (possibly different for each row) or NULL
2362 
2363    Output Parameter:
2364 .  A - the matrix
2365 
2366    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2368    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2369 
2370    Notes:
2371    If nnz is given then nz is ignored
2372 
2373    The AIJ format (also called the Yale sparse matrix format or
2374    compressed row storage), is fully compatible with standard Fortran 77
2375    storage.  That is, the stored row and column indices can begin at
2376    either one (as in Fortran) or zero.  See the users' manual for details.
2377 
2378    Specify the preallocated storage with either nz or nnz (not both).
2379    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2380    allocation.  For large problems you MUST preallocate memory or you
2381    will get TERRIBLE performance, see the users' manual chapter on matrices.
2382 
2383    By default, this format uses inodes (identical nodes) when possible, to
2384    improve numerical efficiency of matrix-vector products and solves. We
2385    search for consecutive rows with the same nonzero structure, thereby
2386    reusing matrix information to achieve increased efficiency.
2387 
2388    Level: intermediate
2389 
2390 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2391 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* standard create/size/type sequence (see man page comment above), then preallocate
     using the SeqAIJ routine; the cast drops const to match its signature */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2403 
/* Destroy a SEQAIJCUSPARSE matrix: release the GPU-side data (mat-vec structs or
   triangular factors), free the optional device-resident split-CSR mirror, detach
   composed methods, and fall through to the SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (A->spptr) { /* guard: spptr can be NULL if the type conversion failed before allocating it */
      d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
      ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    /* a device-side split-CSR mirror exists: copy its header to the host to learn what it owns */
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (h_mat.seq) { /* NOTE(review): only the sequential case frees here; presumably the MPI wrapper owns d_mat otherwise — confirm */
      if (a->compressedrow.use) {
        err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
      }
      err = cudaFree(d_mat);CHKERRCUDA(err);
    }
  }
  /* detach composed methods so stale function pointers cannot be invoked after the type data is gone */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2439 
2440 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
2441 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by first performing a plain SeqAIJ duplicate (host data), then
   converting the copy in place back to the CUSPARSE type. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* in-place conversion attaches the cuSPARSE data structures and GPU ops to *B */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2451 
/* Switch the matrix operation table between host (SeqAIJ) and device (CUSPARSE)
   implementations.  flg == PETSC_TRUE binds to the CPU; PETSC_FALSE binds to the GPU. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices have no GPU mat-vec kernels wired in, so there is nothing to (un)bind */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make sure the host copy of the values is current first */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    /* remove GPU-specific composed methods so callers fall back to host paths */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  } else {
    /* binding to GPU: install the cuSPARSE implementations and composed methods */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; enable them only when bound to the CPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
2490 
/* Zero all stored entries.  Device-side copies (CSR values, CSR-transpose values,
   and the optional split-CSR mirror) are cleared in addition to the host values,
   so both copies end up valid when the GPU held data. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *dmat = NULL;
  PetscErrorCode             ierr;
  PetscBool                  zeroedOnDevice = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        zeroedOnDevice = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
    dmat = cusp->deviceMat;
  }
  if (dmat) {
    /* the split-CSR mirror stores its own device values pointer; fetch it, then memset */
    Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
    PetscScalar *devVals;
    cudaError_t  err;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&devVals,&dmat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err  = cudaMemset(devVals,0,a->i[A->rmap->n]*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  /* zero the host copy as well */
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  if (zeroedOnDevice) A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}
2529 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE: attach the cuSPARSE data structures,
   set the CUDA default vector type, and install the GPU operation table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  /* for MAT_INPLACE_MATRIX neither branch runs: *newmat already aliases A */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: attach the mat-vec data structure and a cuSPARSE handle */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: attach the triangular-factor data structure instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  /* start bound to the GPU (installs the CUSPARSE ops and composed methods) */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2577 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix and
   convert it in place, then process type-specific options. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure CUDA has been initialized before touching any device resources */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscObjectOptionsBegin declares PetscOptionsObject used on the next line */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2591 
2592 /*MC
2593    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2594 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
2596    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2597    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2598 
2599    Options Database Keys:
2600 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2601 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2602 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2603 
2604   Level: beginner
2605 
2606 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2607 M*/
2608 
2609 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2610 
2611 
/* Register the cuSPARSE solver backend for every factorization type it supports
   (LU, Cholesky, ILU, ICC) on SEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscInt            t;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  for (t = 0; t < (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0])); t++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[t],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2623 
/* Free the Mat_SeqAIJCUSPARSE struct and everything it owns: the mat-vec structs,
   the COO assembly scratch arrays, the cuSPARSE handle, and (CUDA >= 11) the
   csr2csc conversion buffer.  Safe to call when *cusparsestruct is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* C++ delete of a NULL pointer is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->cooPerm_v;
    delete (*cusparsestruct)->cooPerm_w;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is valid, so the buffer is freed unconditionally */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2647 
/* Release the thrust device vectors backing a CsrMatrix and the struct itself,
   then NULL the caller's pointer.  A NULL input is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(0);
}
2660 
/* Free one triangular-factor struct: its cuSPARSE descriptor, analysis info,
   CSR storage, and (CUDA >= 11) the solve/csr2csc scratch buffers.
   Safe to call when *trifactor is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr;
    if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    delete *trifactor;
    *trifactor = NULL;
  }
  PetscFunctionReturn(0);
}
2681 
/* Free one mat-vec struct.  Depending on the storage format the payload is a
   cusparseHybMat_t (ELL/HYB, pre-CUDA-11 only) or a CsrMatrix.  Also releases the
   descriptor, the device-resident scalar constants, and (CUDA >= 11) the generic
   SpMV descriptors/buffers.  Safe to call when *matstruct is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* fix: the returned error code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-side scalars used by the SpMV calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
2725 
/* Reset (but do not free) a tri-factors container: destroy all four factor
   structs, the permutation index arrays, and the work vector, leaving the
   container itself and its cuSPARSE handle intact for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* delete of NULL is a no-op; pointers are NULLed so a later Reset/Destroy is safe */
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
2745 
/* Fully destroy a tri-factors container: reset its contents, destroy the
   cuSPARSE handle, and free the struct.  Safe to call when *trifactors is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* fix: assignment was previously embedded in the if condition (-Wparentheses) */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2762 
/* Strict-weak lexicographic ordering on (row,col) tuples, used to sort COO entries */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
2773 
/* Equality of (row,col) tuples, used to detect duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
2783 
/* 1 where adjacent values differ, 0 where they are equal (for adjacent_difference) */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
2792 
/* Logical OR of two flags, returned as a PetscInt (0 or 1) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0) || (t2 != 0);
  }
};
2801 
2802 #include <thrust/iterator/discard_iterator.h>
/* Set matrix values on the GPU from an array v[] ordered like the COO (i,j) pairs
   given to MatSetPreallocationCOO_SeqAIJCUSPARSE.  cooPerm (and, when duplicates
   existed, cooPerm_a) computed at preallocation time map the user's COO order onto
   the sorted CSR storage.  imode selects ADD_VALUES vs INSERT_VALUES semantics. */
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  /* no COO preallocation was done: nothing to scatter, just flush assembly */
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* cooPerm_v is a device staging buffer for the user's values in COO order */
  if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (v) {
    cusp->cooPerm_v->assign(v,v+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) {
      /* duplicate COO entries exist: sum duplicates (keyed by cooPerm_a segments)
         into cooPerm_w, then add that onto the CSR values */
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      /* all entries unique: permute the values into CSR order and add in place */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
                              if we are inserting two different values into the same location */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else {
      /* all entries unique: straight permuted copy into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2867 
2868 #include <thrust/binary_search.h>
/* Preallocate the matrix from n COO entries (coo_i[], coo_j[]).  On the device:
   sort the (i,j) pairs, record the sorting permutation in cooPerm, detect
   duplicates (cooPerm_a becomes a segment index, or is deleted when all entries
   are unique), and build the CSR row offsets.  The host a->i/a->j arrays are then
   rebuilt from the device data and the CUSPARSE mat-vec struct is (re)created. */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* discard previous COO scratch arrays if the entry count changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    /* lexicographically sort the (i,j) pairs, carrying the permutation in cooPerm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep sorted copies of i (in cooPerm_a) and j (in w) before unique() compacts them */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* build a segment index: 1 wherever either i or j changes, then prefix-sum,
         so duplicate (i,j) pairs share the same index in cooPerm_a */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* CSR row ends: count, per row, how many unique entries have row index <= r */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    /* rebuild the host-side SeqAIJ structure from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    /* empty matrix: fall back to a standard zero-size preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy garbage to the GPU */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    /* zero the freshly created device values and drop any stale transpose */
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  /* NOTE(review): offloadmask was already set to PETSC_OFFLOAD_CPU above; CopyToGPU
     presumably changed it, and this restores the CPU-as-master state — confirm */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
2973