xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 5f80ce2ab25dff0f4601e710601cbbcecf323266)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Enum-name tables consumed by PetscOptionsEnum(); trailing entries are the enum type
   name, the common prefix stripped from option values, and a NULL terminator. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* NOTE: the position of each string below must match the cuSPARSE enum's integer value;
     MatSetFromOptions_SeqAIJCUSPARSE() asserts this correspondence at runtime. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
58 
/* Forward declarations of the static implementations defined later in this file. */

/* Cholesky/ICC factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* LU/ILU factorization entry points */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves and basic matrix operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Host/device data movement and cache invalidation */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* COO assembly support */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU
   context and installs it on the cuSPARSE handle, so subsequent cuSPARSE
   calls for this matrix are issued on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  CHKERRCUSPARSE(cusparseSetStream(cusp->handle,cusp->stream));
  PetscFunctionReturn(0);
}
105 
/*
   MatCUSPARSESetHandle - replaces the matrix's cuSPARSE handle with a
   caller-provided one (destroying any previously owned handle) and sets the
   handle's pointer mode to device pointers.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle before adopting the new one */
    if (cusp->handle) CHKERRCUSPARSE(cusparseDestroy(cusp->handle));
    cusp->handle = handle;
  }
  CHKERRCUSPARSE(cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  PetscFunctionReturn(0);
}
121 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle
   stored in the matrix's GPU context; a no-op when the matrix is not of type
   MATSEQAIJCUSPARSE or has no GPU context.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;

  PetscFunctionBegin;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse));
  if (!iscusparse || !cusp) PetscFunctionReturn(0);
  /* only drop the reference; ownership is assumed to lie elsewhere */
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Query callback composed on factored matrices: reports MATSOLVERCUSPARSE as the solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   Factory routine registered with PETSc's MatSolverType system: creates the factor
   matrix B (of type MATSEQAIJCUSPARSE) for A and installs the symbolic-factorization
   function pointers appropriate for the requested factor type. When A is bound to
   the CPU, the plain SeqAIJ symbolic routines are installed instead of the CUSPARSE
   ones so the whole factorization stays on the host.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B));
  /* factor matrices are square with the row dimension of A */
  CHKERRQ(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  CHKERRQ(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* propagate CPU binding so the factor follows A's placement */
  if (A->boundtocpu && A->bindingpropagates) CHKERRQ(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    CHKERRQ(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* defer the actual allocation to the symbolic factorization */
  CHKERRQ(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  CHKERRQ(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
194 
/*
   Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE:
   both supported operations store the same single format field, so they share
   one case body.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
212 
213 /*@
214    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
215    operation. Only the MatMult operation can use different GPU storage formats
216    for MPIAIJCUSPARSE matrices.
217    Not Collective
218 
219    Input Parameters:
220 +  A - Matrix of type SEQAIJCUSPARSE
221 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
222 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
223 
224    Output Parameter:
225 
226    Level: intermediate
227 
228 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
229 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; a no-op for matrix types that
     do not compose "MatCUSPARSESetFormat_C" */
  CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)));
  PetscFunctionReturn(0);
}
237 
/*
   Type-specific implementation of MatCUSPARSESetUseCPUSolve(): records whether
   MatSolve should run on the CPU instead of the GPU (consulted by
   MatLUFactorNumeric_SeqAIJCUSPARSE when installing solve callbacks).
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against a missing GPU context before dereferencing, consistent with
     MatCUSPARSESetStream()/MatCUSPARSESetHandle() */
  PetscCheckFalse(!cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
246 
247 /*@
248    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
249 
250    Input Parameters:
251 +  A - Matrix of type SEQAIJCUSPARSE
252 -  use_cpu - set flag for using the built-in CPU MatSolve
253 
254    Output Parameter:
255 
256    Notes:
257    The cuSparse LU solver currently computes the factors with the built-in CPU method
258    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
260 
261    Level: intermediate
262 
263 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
264 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; a no-op for matrix types that
     do not compose "MatCUSPARSESetUseCPUSolve_C" */
  CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)));
  PetscFunctionReturn(0);
}
272 
/*
   MatSetOption override for SEQAIJCUSPARSE: intercepts MAT_FORM_EXPLICIT_TRANSPOSE
   (which also affects the cached GPU transpose) and forwards every other option
   to the SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    CHKERRQ(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
288 
289 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
290 
/*
   Numeric LU factorization: the factorization itself is done on the CPU by the
   SeqAIJ kernel; afterwards the appropriate (GPU or CPU) triangular-solve
   callbacks are installed and, unless a CPU solve was requested, the factors
   are analyzed and copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ*)B->data;
  IS                 isrow  = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusp  = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  CHKERRQ(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used: the natural-ordering
     kernels apply when both row and column permutations are identities */
  CHKERRQ(ISIdentity(isrow,&row_identity));
  CHKERRQ(ISIdentity(iscol,&col_identity));
  if (!cusp->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* multi-RHS solves are not provided by this type */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  if (!cusp->use_cpu_solve) CHKERRQ(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
327 
/*
   Options processing for SEQAIJCUSPARSE matrices: storage format, CPU-solve
   toggle, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
   Skipped entirely for factored matrices.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  CHKERRQ(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options"));
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    CHKERRQ(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) CHKERRQ(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  CHKERRQ(PetscOptionsTail());
  PetscFunctionReturn(0);
}
368 
/*
   Symbolic ILU factorization: resets any stale GPU triangular-factor data,
   delegates the symbolic phase to the SeqAIJ (CPU) kernel, and installs the
   CUSPARSE numeric factorization callback.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  CHKERRQ(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
379 
/*
   Symbolic LU factorization: resets any stale GPU triangular-factor data,
   delegates the symbolic phase to the SeqAIJ (CPU) kernel, and installs the
   CUSPARSE numeric factorization callback.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  CHKERRQ(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
390 
/*
   Symbolic ICC factorization: resets any stale GPU triangular-factor data,
   delegates the symbolic phase to the SeqAIJ (CPU) kernel, and installs the
   CUSPARSE numeric Cholesky callback.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  CHKERRQ(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
401 
/*
   Symbolic Cholesky factorization: resets any stale GPU triangular-factor data,
   delegates the symbolic phase to the SeqAIJ (CPU) kernel, and installs the
   CUSPARSE numeric Cholesky callback.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  CHKERRQ(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
412 
/*
   Builds (or refreshes) the GPU copy of the unit lower-triangular factor L of an
   ILU factorization stored in the SeqAIJ arrays of A. The strict lower triangle
   of each row is copied from the host CSR arrays and a 1.0 diagonal entry is
   appended, producing a CSR matrix with n + ai[n] - ai[1] nonzeros. On first call
   the full structure (descriptor, CSR arrays, solve analysis) is created; on
   later calls only the numerical values are re-uploaded. No-op when n == 0 or
   when the GPU already holds current data.

   NOTE(review): assumes a->i/a->j/a->a hold the combined ILU factors in SeqAIJ
   factored-matrix layout (row i starting with its strict lower triangle) — this
   layout is produced by MatLUFactorNumeric_SeqAIJ; confirm against that kernel.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit diagonal entries plus the strict lower entries of rows 1..n-1; row 0 has none) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign()s below are fast H2D copies */
        CHKERRCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        CHKERRCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        CHKERRCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strict lower triangle of row i, then append the unit diagonal */
          CHKERRQ(PetscArraycpy(&(AjLo[offset]), vi, nz));
          CHKERRQ(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        CHKERRQ(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload the assembled CSR arrays to device vectors */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer csrsv2-style API needs an explicit workspace buffer */
        CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        CHKERRCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        CHKERRCUDA(WaitForCUDA());
        CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer; AALo is kept (as AA_h) for later value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        CHKERRCUDA(cudaFreeHost(AiLo));
        CHKERRCUDA(cudaFreeHost(AjLo));
        CHKERRQ(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only: sparsity pattern is unchanged, re-upload values */
        if (!loTriFactor->AA_h) {
          CHKERRCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          CHKERRQ(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        CHKERRQ(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
556 
557 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
558 {
559   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
560   PetscInt                          n = A->rmap->n;
561   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
562   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
563   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
564   const MatScalar                   *aa = a->a,*v;
565   PetscInt                          *AiUp, *AjUp;
566   PetscInt                          i,nz, nzUpper, offset;
567 
568   PetscFunctionBegin;
569   if (!n) PetscFunctionReturn(0);
570   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
571     try {
572       /* next, figure out the number of nonzeros in the upper triangular matrix. */
573       nzUpper = adiag[0]-adiag[n];
574       if (!upTriFactor) {
575         PetscScalar *AAUp;
576 
577         CHKERRCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
578 
579         /* Allocate Space for the upper triangular matrix */
580         CHKERRCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
581         CHKERRCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
582 
583         /* Fill the upper triangular matrix */
584         AiUp[0]=(PetscInt) 0;
585         AiUp[n]=nzUpper;
586         offset = nzUpper;
587         for (i=n-1; i>=0; i--) {
588           v  = aa + adiag[i+1] + 1;
589           vi = aj + adiag[i+1] + 1;
590 
591           /* number of elements NOT on the diagonal */
592           nz = adiag[i] - adiag[i+1]-1;
593 
594           /* decrement the offset */
595           offset -= (nz+1);
596 
597           /* first, set the diagonal elements */
598           AjUp[offset] = (PetscInt) i;
599           AAUp[offset] = (MatScalar)1./v[nz];
600           AiUp[i]      = AiUp[i+1] - (nz+1);
601 
602           CHKERRQ(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
603           CHKERRQ(PetscArraycpy(&(AAUp[offset+1]), v, nz));
604         }
605 
606         /* allocate space for the triangular factor information */
607         CHKERRQ(PetscNew(&upTriFactor));
608         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
609 
610         /* Create the matrix description */
611         CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
612         CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
613        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614         CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
615        #else
616         CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
617        #endif
618         CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
619         CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
620 
621         /* set the operation */
622         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
623 
624         /* set the matrix */
625         upTriFactor->csrMat = new CsrMatrix;
626         upTriFactor->csrMat->num_rows = n;
627         upTriFactor->csrMat->num_cols = n;
628         upTriFactor->csrMat->num_entries = nzUpper;
629 
630         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
631         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
632 
633         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
634         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
635 
636         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
637         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
638 
639         /* Create the solve analysis information */
640         CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
641         CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
642       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
643         CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
644                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
645                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
646                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
647                                                &upTriFactor->solveBufferSize));
648         CHKERRCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
649       #endif
650 
651         /* perform the solve analysis */
652         CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
653                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
654                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
655                                          upTriFactor->csrMat->column_indices->data().get(),
656                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
657                                          upTriFactor->solveInfo,
658                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
659                                          #else
660                                          upTriFactor->solveInfo));
661                                          #endif
662         CHKERRCUDA(WaitForCUDA());
663         CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
664 
665         /* assign the pointer */
666         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
667         upTriFactor->AA_h = AAUp;
668         CHKERRCUDA(cudaFreeHost(AiUp));
669         CHKERRCUDA(cudaFreeHost(AjUp));
670         CHKERRQ(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
671       } else {
672         if (!upTriFactor->AA_h) {
673           CHKERRCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
674         }
675         /* Fill the upper triangular matrix */
676         offset = nzUpper;
677         for (i=n-1; i>=0; i--) {
678           v  = aa + adiag[i+1] + 1;
679 
680           /* number of elements NOT on the diagonal */
681           nz = adiag[i] - adiag[i+1]-1;
682 
683           /* decrement the offset */
684           offset -= (nz+1);
685 
686           /* first, set the diagonal elements */
687           upTriFactor->AA_h[offset] = 1./v[nz];
688           CHKERRQ(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
689         }
690         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
691         CHKERRQ(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
692       }
693     } catch(char *ex) {
694       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
695     }
696   }
697   PetscFunctionReturn(0);
698 }
699 
/*
   Pushes both ILU triangular factors of A to the GPU and, for non-identity
   row/column orderings, mirrors the permutation index arrays to the device.
   Also lazily creates the device work vector used by the triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis = aij->row,colis = aij->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheckFalse(!factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* build (or refresh) the device copies of the two triangular factors */
  CHKERRQ(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  CHKERRQ(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* copy the row permutation to the device once, if it is not the identity */
  CHKERRQ(ISIdentity(rowis,&rowIdentity));
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    CHKERRQ(ISGetIndices(rowis,&ridx));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(ridx,ridx+m);
    CHKERRQ(ISRestoreIndices(rowis,&ridx));
    CHKERRQ(PetscLogCpuToGpu(m*sizeof(PetscInt)));
  }

  /* likewise for the column permutation */
  CHKERRQ(ISIdentity(colis,&colIdentity));
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    CHKERRQ(ISGetIndices(colis,&cidx));
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(cidx,cidx+m);
    CHKERRQ(ISRestoreIndices(colis,&cidx));
    CHKERRQ(PetscLogCpuToGpu(m*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
742 
/*
   Assembles the ICC(0) triangular factors of A on the GPU from the host
   factorization (which stores the factor in SBAIJ upper-triangular layout,
   with diagonal entries already inverted — TODO(review): confirm against
   MatICCFactorSymbolic_SeqAIJ).

   Two cuSPARSE CSR matrices are built:
     - upTriFactor: upper factor, unit diagonal, solved non-transposed
     - loTriFactor: shares the upper-triangular structure but is solved with
       CUSPARSE_OPERATION_TRANSPOSE, i.e. it represents the lower factor
   On the first call the full structures (row offsets, column indices, values,
   descriptors, and solve-analysis data) are created; subsequent calls only
   refresh the numerical values.

   Fix vs. previous revision: the PetscCheckFalse messages in the refresh path
   claimed "Missing cusparseTriFactors" although they test upTriFactor and
   loTriFactor (cusparseTriFactors itself is validated by the caller).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data; /* factor data is laid out as SBAIJ */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper- and lower-factor values */
      CHKERRCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        CHKERRCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        CHKERRCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            CHKERRQ(PetscArraycpy(&(AjUp[offset]), vj, nz));
            CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* negate the off-diagonal entries; the lower-factor values are
               additionally scaled by the inverted diagonal v[nz] */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        CHKERRQ(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        CHKERRCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        CHKERRCUDA(WaitForCUDA());
        CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        CHKERRQ(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* the lower factor reuses the upper-triangular storage and is applied
           with CUSPARSE_OPERATION_TRANSPOSE below, hence FILL_MODE_UPPER here */
        CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        CHKERRCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        CHKERRCUDA(WaitForCUDA());
        CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        CHKERRQ(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        CHKERRCUDA(cudaFreeHost(AiUp));
        CHKERRCUDA(cudaFreeHost(AjUp));
      } else {
        /* Refresh path: structures already exist, recompute only the values.
           Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheckFalse(!upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
        PetscCheckFalse(!loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        CHKERRQ(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      CHKERRCUDA(cudaFreeHost(AAUp));
      CHKERRCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
959 
/*
   Pushes the ICC triangular factors of A to the GPU and, when the ordering is
   not the natural one, mirrors the permutation and its inverse to the device.
   Also lazily creates the device work vector used by the triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm = aij->row;
  PetscBool                    identity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheckFalse(!factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  CHKERRQ(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  /* aij->nz counts the stored (upper) triangle incl. diagonal; count the
     strict triangle twice (L and U) plus the diagonal once */
  factors->nnz = (aij->nz-m)*2 + m;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* copy the permutation and its inverse to the device, if not the identity */
  CHKERRQ(ISIdentity(perm,&identity));
  if (!identity) {
    IS             inv;
    const PetscInt *invIdx,*permIdx;

    CHKERRQ(ISInvertPermutation(perm,PETSC_DECIDE,&inv));
    CHKERRQ(ISGetIndices(inv,&invIdx));
    CHKERRQ(ISGetIndices(perm,&permIdx));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(permIdx,permIdx+m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(invIdx,invIdx+m);
    CHKERRQ(ISRestoreIndices(inv,&invIdx));
    CHKERRQ(ISDestroy(&inv));
    CHKERRQ(ISRestoreIndices(perm,&permIdx));
    CHKERRQ(PetscLogCpuToGpu(2.*m*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
996 
/*
   Numeric Cholesky factorization for SeqAIJCUSPARSE: factors on the host via
   the SeqAIJ routine, selects the solve kernels based on whether the ordering
   is the natural one, and copies the triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  IS             rowperm = bseq->row;
  PetscBool      natural;

  PetscFunctionBegin;
  /* factor numerically on the host; the factor now lives on the CPU */
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  CHKERRQ(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variants: natural ordering skips the permutation work */
  CHKERRQ(ISIdentity(rowperm,&natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  CHKERRQ(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
1025 
1026 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1027 {
1028   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1029   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1030   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1031   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1032   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1033   cusparseIndexBase_t               indexBase;
1034   cusparseMatrixType_t              matrixType;
1035   cusparseFillMode_t                fillMode;
1036   cusparseDiagType_t                diagType;
1037 
1038   PetscFunctionBegin;
1039   /* allocate space for the transpose of the lower triangular factor */
1040   CHKERRQ(PetscNew(&loTriFactorT));
1041   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1042 
1043   /* set the matrix descriptors of the lower triangular factor */
1044   matrixType = cusparseGetMatType(loTriFactor->descr);
1045   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1046   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1047     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1048   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1049 
1050   /* Create the matrix description */
1051   CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1052   CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1053   CHKERRCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1054   CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1055   CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1056 
1057   /* set the operation */
1058   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1059 
1060   /* allocate GPU space for the CSC of the lower triangular factor*/
1061   loTriFactorT->csrMat = new CsrMatrix;
1062   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1063   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1064   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1065   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1066   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1067   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1068 
1069   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1070 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1071   CHKERRCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1072                                                loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1073                                                loTriFactor->csrMat->values->data().get(),
1074                                                loTriFactor->csrMat->row_offsets->data().get(),
1075                                                loTriFactor->csrMat->column_indices->data().get(),
1076                                                loTriFactorT->csrMat->values->data().get(),
1077                                                loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1078                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1079                                                CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1080   CHKERRCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
1081 #endif
1082 
1083   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1084   CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1085                                   loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1086                                   loTriFactor->csrMat->values->data().get(),
1087                                   loTriFactor->csrMat->row_offsets->data().get(),
1088                                   loTriFactor->csrMat->column_indices->data().get(),
1089                                   loTriFactorT->csrMat->values->data().get(),
1090                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1091                                   loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1092                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1093                                   CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
1094                                   #else
1095                                   loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1096                                   CUSPARSE_ACTION_NUMERIC, indexBase));
1097                                   #endif
1098   CHKERRCUDA(WaitForCUDA());
1099   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1100 
1101   /* Create the solve analysis information */
1102   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1103   CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
1104 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1105   CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1106                                          loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1107                                          loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1108                                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1109                                          &loTriFactorT->solveBufferSize));
1110   CHKERRCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
1111 #endif
1112 
1113   /* perform the solve analysis */
1114   CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1115                                    loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1116                                    loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1117                                    loTriFactorT->csrMat->column_indices->data().get(),
1118                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1119                                    loTriFactorT->solveInfo,
1120                                    loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1121                                    #else
1122                                    loTriFactorT->solveInfo));
1123                                    #endif
1124   CHKERRCUDA(WaitForCUDA());
1125   CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1126 
1127   /* assign the pointer */
1128   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1129 
1130   /*********************************************/
1131   /* Now the Transpose of the Upper Tri Factor */
1132   /*********************************************/
1133 
1134   /* allocate space for the transpose of the upper triangular factor */
1135   CHKERRQ(PetscNew(&upTriFactorT));
1136   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1137 
1138   /* set the matrix descriptors of the upper triangular factor */
1139   matrixType = cusparseGetMatType(upTriFactor->descr);
1140   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1141   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1142     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1143   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1144 
1145   /* Create the matrix description */
1146   CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1147   CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1148   CHKERRCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1149   CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1150   CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1151 
1152   /* set the operation */
1153   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1154 
1155   /* allocate GPU space for the CSC of the upper triangular factor*/
1156   upTriFactorT->csrMat = new CsrMatrix;
1157   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1158   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1159   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1160   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1161   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1162   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1163 
1164   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1165 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1166   CHKERRCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1167                                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1168                                                upTriFactor->csrMat->values->data().get(),
1169                                                upTriFactor->csrMat->row_offsets->data().get(),
1170                                                upTriFactor->csrMat->column_indices->data().get(),
1171                                                upTriFactorT->csrMat->values->data().get(),
1172                                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1173                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1174                                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1175   CHKERRCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
1176 #endif
1177 
1178   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1179   CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1180                                   upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1181                                   upTriFactor->csrMat->values->data().get(),
1182                                   upTriFactor->csrMat->row_offsets->data().get(),
1183                                   upTriFactor->csrMat->column_indices->data().get(),
1184                                   upTriFactorT->csrMat->values->data().get(),
1185                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1186                                   upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1187                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1188                                   CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
1189                                   #else
1190                                   upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1191                                  CUSPARSE_ACTION_NUMERIC, indexBase));
1192                                  #endif
1193 
1194   CHKERRCUDA(WaitForCUDA());
1195   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1196 
1197   /* Create the solve analysis information */
1198   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1199   CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
1200   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1201   CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1202                                          upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1203                                          upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1204                                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1205                                          &upTriFactorT->solveBufferSize));
1206   CHKERRCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
1207   #endif
1208 
1209   /* perform the solve analysis */
1210   /* christ, would it have killed you to put this stuff in a function????????? */
1211   CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1212                                    upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1213                                    upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1214                                    upTriFactorT->csrMat->column_indices->data().get(),
1215                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1216                                    upTriFactorT->solveInfo,
1217                                    upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1218                                    #else
1219                                    upTriFactorT->solveInfo));
1220                                    #endif
1221 
1222   CHKERRCUDA(WaitForCUDA());
1223   CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1224 
1225   /* assign the pointer */
1226   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1227   PetscFunctionReturn(0);
1228 }
1229 
/* Unary functor usable on host and device that converts a PetscScalar into a
   PetscInt by truncating its real part. Used with thrust::transform to turn
   index values that were carried through cusparse as scalars back into
   integer permutation indices. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1238 
/* Form (or refresh) the explicit transpose of A on the GPU and cache it in
   cusparsestruct->matTranspose.  In CSR format the transpose is produced with
   cusparse csr2csc; the permutation array csr2csc_i is computed once and cached,
   so subsequent refreshes only permute the CSR value array into the cached
   transpose.  For ELL/HYB (only available before CUDA-11) the conversion goes
   hyb2csr -> csr2csc -> csr2hyb.  No-op when A->transupdated is already set. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still current */
  CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  CHKERRQ(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    CHKERRCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    CHKERRCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    CHKERRCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    CHKERRCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    CHKERRCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate CSR storage for the transpose: dimensions swapped, nnz unchanged */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload the host CSR row offsets of A; csr2csc needs them on the device */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT->num_rows and the row_offsets length use A->rmap->n,
         although the transpose has A->cmap->n rows; this looks questionable for
         non-square matrices -- confirm (legacy pre-CUDA-11 code path) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheckFalse(!matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheckFalse(!matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheckFalse(!matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheckFalse(!matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheckFalse(!matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheckFalse(!matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheckFalse(!matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Compute and cache csr2csc_i: run csr2csc once on the sequence 0,1,2,...
         so each transposed "value" records the CSR position it came from; the
         scalar results are then truncated back to integer indices below. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      CHKERRCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      CHKERRCUDA(cudaFree(csr2cscBuffer));
     #endif
    }
    /* gather the up-to-date values: matrixT->values[k] = matrix->values[csr2csc_i[k]] */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1466 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU using the transposed triangular factors of a
   matrix factored with a (non-natural) ordering: b is gathered into x through
   the row permutation, x is solved in place with the transposed upper factor
   and then the transposed lower factor, and finally the column permutation is
   applied (via a temporary, since the gather cannot be done in place). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only on first use */
  if (!loTriFactorT && !upTriFactorT) {
    CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, gather b into x with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1554 
/* Solve A^T x = b on the GPU for a matrix factored with the natural ordering:
   no row/column permutations are needed, so b is solved directly with the
   transposed upper factor into the work vector and then with the transposed
   lower factor into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only on first use */
  if (!loTriFactorT && !upTriFactorT) {
    CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, solve with the transposed upper factor: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1623 
/* Solve A x = b on the GPU using the triangular factors of a matrix factored
   with a (non-natural) ordering: b is gathered into the work vector through
   the row permutation, then forward-solved with L into x, back-solved with U
   into the work vector, and finally scattered into x through the column
   permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1698 
/* Solve A x = b on the GPU for a matrix factored with the natural ordering:
   no row/column permutations are needed, so b is forward-solved with L into
   the work vector and back-solved with U into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1759 
/*
   MatSeqAIJCUSPARSECopyFromGPU - copy the matrix values (not the structure) from
   the device back to the host array a->a.

   Only the CSR storage format keeps a CsrMatrix on the device that mirrors the
   host nonzero pattern, so only values need transferring; afterwards the matrix
   is valid on both sides (PETSC_OFFLOAD_BOTH).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    /* For MAT_CUSPARSE_ELL/HYB, cusp->mat->mat holds a cusparseHybMat_t (see
       MatSeqAIJCUSPARSECopyToGPU), so casting it to CsrMatrix would read garbage;
       fail loudly instead of silently corrupting the host values */
    PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Copy from GPU is only implemented for MAT_CUSPARSE_CSR format");
    matrix = (CsrMatrix*)cusp->mat->mat;

    CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    CHKERRCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    CHKERRCUDA(WaitForCUDA());
    CHKERRQ(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1778 
/* Return read/write access to the host values array, syncing from the GPU first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* make sure the host copy of the values is current before handing it out */
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(0);
}
1786 
/* End read/write access: the caller may have changed values, so only the CPU copy is valid */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1794 
/* Return read-only access to the host values array, syncing from the GPU first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* refresh the host copy so the caller sees up-to-date values */
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(0);
}
1802 
/* End read-only access: values were not modified, so the offload state is untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
1809 
/* Return write-only access to the host values array; no device-to-host copy is
   performed since the caller promises to overwrite the values */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1816 
/* End write access: new values live only on the host now */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1824 
/*
   MatSeqAIJCUSPARSECopyToGPU - ensure the cuSPARSE (device) copy of A is current.

   Fast path: if the nonzero state is unchanged and the format is CSR, only the
   values array is re-uploaded.  Slow path: the whole device structure is torn
   down and rebuilt from the host CSR data (optionally in compressed-row form),
   as either a CSR matrix or (CUDA < 11 only) an ELL/HYB matrix.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when host values are absent, so offloadmask stays as-is */

  PetscFunctionBegin;
  PetscCheckFalse(A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      CHKERRCUDA(WaitForCUDA());
      CHKERRQ(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* values changed but structure did not: drop only the transpose values */
      CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      /* structure changed (or non-CSR format): rebuild the device matrix from scratch */
      CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage skips empty rows: m/ii/ridx describe only nonempty rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* no host values: upload structure only and do not mark the host side valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheckFalse(nnz && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        CHKERRCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        CHKERRCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident alpha/beta scalars, needed because the handle is put in
           CUSPARSE_POINTER_MODE_DEVICE below */
        CHKERRCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        CHKERRCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* CUDA < 11: build a temporary CSR matrix on the device, convert it to
             HYB/ELL, then discard the temporary */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        CHKERRQ(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      CHKERRCUDA(WaitForCUDA());
      CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1979 
/* Thrust functor: accumulate the first tuple entry into the second (dst += src) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) += thrust::get<0>(zipped);
  }
};
1989 
/* Thrust functor: copy the first tuple entry into the second (dst = src) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
1999 
/* Thrust functor: copy the second tuple entry into the first (reverse direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
2009 
/* Per-product state attached to C->product->data by the AIJCUSPARSE MatMat
   routines; freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was MATSEQDENSE on entry to symbolic; convert back after numeric */
  PetscScalar           *Bt;       /* device buffer holding an explicit B^T (CUDA < 11 csrmm path only) */
  Mat                   X;         /* intermediate dense product (A*P or A*R^T) for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* size of mmBuffer as reported by cusparseSpMM_bufferSize */
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2034 
/* Destroy callback for MatMatCusparse product data: releases device buffers,
   cuSPARSE descriptors, and the intermediate dense matrix */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;

  PetscFunctionBegin;
  CHKERRCUDA(cudaFree(mm->Bt)); /* cudaFree(NULL) is a documented no-op */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) CHKERRCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr)   CHKERRCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr)   CHKERRCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc)  CHKERRCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)  CHKERRCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5)  CHKERRCUDA(cudaFree(mm->dBuffer5));
 #endif
  if (mm->mmBuffer)  CHKERRCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) CHKERRCUDA(cudaFree(mm->mmBuffer2));
 #endif
  CHKERRQ(MatDestroy(&mm->X));
  CHKERRQ(PetscFree(data));
  PetscFunctionReturn(0);
}
2058 
2059 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2060 
/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of a sparse
   (AIJCUSPARSE) times dense (DENSECUDA) product.

   Handles AB, AtB, ABt directly via cuSPARSE SpMM (or csrmm on CUDA < 11);
   for PtAP/RARt it first computes X = A*B (resp. A*B^t) into mmdata->X and
   then finishes with a dense-dense multiply.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select which stored matrix (A or its explicit transpose) to use and the
     dimensions m x n of the result of this SpMM step */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheckFalse(!mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) CHKERRQ(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  CHKERRQ(MatDenseCUDAGetArrayRead(B,&barray));

  CHKERRQ(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt write the intermediate result into mmdata->X instead of C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    CHKERRQ(MatDenseCUDAGetArrayWrite(C,&carray));
    CHKERRQ(MatDenseGetLDA(C,&clda));
  }

  CHKERRQ(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the work buffer to the requested size */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      CHKERRCUDA(cudaFree(mmdata->mmBuffer));
      CHKERRCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    CHKERRCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly with a cuBLAS out-of-place transpose into mmdata->Bt */
    CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  CHKERRQ(MatDenseCUDARestoreArrayRead(B,&barray));
  /* finish PtAP/RARt with the dense-dense step C = B^T*X (resp. B*X) */
  if (product->type == MATPRODUCT_RARt) {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* undo the temporary conversions done for CPU-resident C and B */
  if (mmdata->cisdense) {
    CHKERRQ(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    CHKERRQ(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2231 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a sparse
   (AIJCUSPARSE) times dense product: sets the sizes/type of C, allocates the
   MatMatCusparse product data, and installs the numeric routine.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  CHKERRQ(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  CHKERRQ(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  CHKERRQ(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    CHKERRCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    CHKERRQ(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2304 
2305 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2306 {
2307   Mat_Product                  *product = C->product;
2308   Mat                          A,B;
2309   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2310   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2311   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2312   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2313   PetscBool                    flg;
2314   cusparseStatus_t             stat;
2315   MatProductType               ptype;
2316   MatMatCusparse               *mmdata;
2317 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2318   cusparseSpMatDescr_t         BmatSpDescr;
2319 #endif
2320   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2321 
2322   PetscFunctionBegin;
2323   MatCheckProduct(C,1);
2324   PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2325   CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
2326   PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2327   mmdata = (MatMatCusparse*)C->product->data;
2328   A = product->A;
2329   B = product->B;
2330   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2331     mmdata->reusesym = PETSC_FALSE;
2332     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2333     PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2334     Cmat = Ccusp->mat;
2335     PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2336     Ccsr = (CsrMatrix*)Cmat->mat;
2337     PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2338     goto finalize;
2339   }
2340   if (!c->nz) goto finalize;
2341   CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2342   PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2343   CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2344   PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2345   PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2346   PetscCheckFalse(B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2347   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2348   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2349   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2350   PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2351   PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2352   PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2353   CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
2354   CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
2355 
2356   ptype = product->type;
2357   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2358     ptype = MATPRODUCT_AB;
2359     PetscCheckFalse(!product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2360   }
2361   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2362     ptype = MATPRODUCT_AB;
2363     PetscCheckFalse(!product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2364   }
2365   switch (ptype) {
2366   case MATPRODUCT_AB:
2367     Amat = Acusp->mat;
2368     Bmat = Bcusp->mat;
2369     break;
2370   case MATPRODUCT_AtB:
2371     Amat = Acusp->matTranspose;
2372     Bmat = Bcusp->mat;
2373     break;
2374   case MATPRODUCT_ABt:
2375     Amat = Acusp->mat;
2376     Bmat = Bcusp->matTranspose;
2377     break;
2378   default:
2379     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2380   }
2381   Cmat = Ccusp->mat;
2382   PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2383   PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2384   PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2385   Acsr = (CsrMatrix*)Amat->mat;
2386   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2387   Ccsr = (CsrMatrix*)Cmat->mat;
2388   PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2389   PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2390   PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2391   CHKERRQ(PetscLogGpuTimeBegin());
2392 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2393   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2394   CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2395   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2396     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2397                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2398                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2399                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2400   #else
2401     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2402                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2403                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2404                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2405     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2406                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2407                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2408   #endif
2409 #else
2410   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2411                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2412                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2413                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2414                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2415 #endif
2416   CHKERRQ(PetscLogGpuFlops(mmdata->flops));
2417   CHKERRCUDA(WaitForCUDA());
2418   CHKERRQ(PetscLogGpuTimeEnd());
2419   C->offloadmask = PETSC_OFFLOAD_GPU;
2420 finalize:
2421   /* shorter version of MatAssemblyEnd_SeqAIJ */
2422   CHKERRQ(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
2423   CHKERRQ(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
2424   CHKERRQ(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2425   c->reallocs         = 0;
2426   C->info.mallocs    += 0;
2427   C->info.nz_unneeded = 0;
2428   C->assembled = C->was_assembled = PETSC_TRUE;
2429   C->num_ass++;
2430   PetscFunctionReturn(0);
2431 }
2432 
/*
   Symbolic phase of C = A*B, C = A^t*B or C = A*B^t for two SEQAIJCUSPARSE matrices.

   Computes the sparsity pattern of C on the GPU with cuSPARSE
   (cusparseSpGEMMreuse for CUDA >= 11.4, cusparseSpGEMM for CUDA 11.0-11.3,
   the legacy csrgemm interface before that), allocates the device CSR storage
   for C, mirrors the pattern on the host so that C is a valid SeqAIJ matrix,
   and caches in the product data (mmdata) the descriptors and work buffers the
   numeric phase (MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE, installed at
   the bottom) will reuse.

   Collective on C; A and B (product->A/product->B) must be MATSEQAIJCUSPARSE
   in MAT_CUSPARSE_CSR format.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be SEQAIJCUSPARSE */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  CHKERRQ(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  /* make sure the device copies of A and B are up to date */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* a symmetric A (resp. B) lets us compute the cheaper A*B instead of A^t*B (resp. A*B^t);
     record the shortcut so the numeric phase can verify it was taken */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick the operand mult structs and the result dimensions (C is m x n, inner dim k).
     Since opA/opB above are always non-transpose, AtB/ABt use an explicitly formed transpose.
     C inherits compressed row storage from the matrix supplying its rows (A, or A^t's columns) */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  CHKERRQ(MatSetSizes(C,m,n,m,n));
  CHKERRQ(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C reuses A's nonzero-row index set; compressedrow.i itself is filled after the SpGEMM */
    c->compressedrow.nrows = a->compressedrow.nrows;
    CHKERRQ(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    CHKERRQ(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used by the (device pointer mode) SpGEMM calls */
  CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: C has no nonzeros; skip SpGEMM and go straight to host finalization */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets;
       column indices and values are shared with B's own CSR storage */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each nonzero a(i,l) multiplies row l of B: 2*nnz(B_l) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* A^t*B = sum_i A(i,:)^t B(i,:): row i of A pairs with row i of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  CHKERRQ(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* create C's descriptor with no storage yet (nnz 0, NULL arrays); cuSPARSE
     determines nnz during SpGEMM and the pointers are set below */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  CHKERRCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* compute the structure (row offsets / nnz) of C; each call is made twice:
     first to query buffer sizes, then to do the work with allocated buffers */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
  CHKERRCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
  CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaFree(dBuffer1));
  CHKERRCUDA(cudaFree(dBuffer2));

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaFree(dBuffer3));
  /* first numeric computation; later products with the same pattern reuse
     dBuffer4/dBuffer5 through the numeric phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* pre-CUDA-11 path: legacy csrgemm interface, nnz query in host pointer mode */
  CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  CHKERRQ(PetscLogGpuFlops(mmdata->flops));
  CHKERRQ(PetscLogGpuTimeEnd());
finalizesym:
  /* build the host (SeqAIJ) view of the pattern: copy row offsets and column
     indices back, expand compressed rows if needed, fill ilen/imax/rmax */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  CHKERRQ(PetscMalloc1(m+1,&c->i));
  CHKERRQ(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    CHKERRCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    CHKERRCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not present in the compressed set get the running offset (they are empty) */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  CHKERRQ(PetscMalloc1(m,&c->ilen));
  CHKERRQ(PetscMalloc1(m,&c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row-length maximum for the SeqAIJ bookkeeping */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  CHKERRQ(MatMarkDiagonal_SeqAIJ(C));
  CHKERRQ(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  CHKERRQ(PetscLayoutSetUp(C->rmap));
  CHKERRQ(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  /* values are not assembled yet; the numeric phase fills them */
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2829 
2830 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2831 
2832 /* handles sparse or dense B */
/* Select the product implementation for a SEQAIJCUSPARSE A: the CUDA sparse-sparse
   path when B (and C for ABC) is SEQAIJCUSPARSE and nothing is bound to the CPU,
   the dense path when B is SEQDENSE, and the plain SeqAIJ implementation otherwise.
   A runtime option (e.g. -matmatmult_backend_cpu) can force the CPU backend. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bdense = PETSC_FALSE,gpuB = PETSC_FALSE,gpuC = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    CHKERRQ(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&gpuB));
  }
  if (product->type == MATPRODUCT_ABC) {
    gpuC = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      CHKERRQ(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&gpuC));
    }
  }
  if (gpuB && gpuC) { /* we can always select the CPU backend */
    PetscBool   usecpu = PETSC_FALSE;
    const char *title  = NULL,*opt = NULL,*man = NULL;

    /* pick the option name, help title and manual section matching the product
       type and the API entry point (classic MatXXX call vs MatProduct API) */
    switch (product->type) {
    case MATPRODUCT_AB:
      man   = "MatMatMult";
      title = product->api_user ? "MatMatMult" : "MatProduct_AB";
      opt   = product->api_user ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      man   = "MatTransposeMatMult";
      title = product->api_user ? "MatTransposeMatMult" : "MatProduct_AtB";
      opt   = product->api_user ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      man   = "MatPtAP";
      title = product->api_user ? "MatPtAP" : "MatProduct_PtAP";
      opt   = product->api_user ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      man   = "MatRARt";
      title = product->api_user ? "MatRARt" : "MatProduct_RARt";
      opt   = product->api_user ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      man   = "MatMatMatMult";
      title = product->api_user ? "MatMatMatMult" : "MatProduct_ABC";
      opt   = product->api_user ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    default: /* product types without a CPU-backend option */
      break;
    }
    if (opt) {
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");CHKERRQ(ierr);
      CHKERRQ(PetscOptionsBool(opt,"Use CPU code",man,usecpu,&usecpu,NULL));
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
    if (usecpu) gpuB = gpuC = PETSC_FALSE;
  }
  /* dispatch */
  if (Bdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        CHKERRQ(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (gpuB && gpuC) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    CHKERRQ(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
2954 
/* Computes yy = A*xx by delegating to the fused mult-add kernel with no addend vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2961 
/* Computes zz = A*xx + yy via the fused mult-add kernel (no transpose, no conjugation) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2968 
/* Computes yy = A^H * xx (conjugate transpose) via the fused mult-add kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2975 
/* Computes zz = A^H * xx + yy (conjugate transpose) via the fused mult-add kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2982 
/* Computes yy = A^T * xx (transpose, no conjugation) via the fused mult-add kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2989 
/* ScatterAdd: y[idx[i]] += x[i] for 0 <= i < n.
   Used to accumulate a compressed-row SpMV result (work vector x) back into the
   full-length output vector y at the nonzero-row positions idx.
   NOTE(review): safe without atomics only when the entries of idx are distinct,
   which holds for the cprowIndices arrays passed by the caller in this file.
   Grid-stride loop: correct for any launch configuration, and all indexing is done
   in PetscInt — the original computed the thread index in 32-bit int, which could
   truncate when n exceeds the range of int (PetscInt may be 64-bit). */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x*gridDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2995 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/MatMultTransposeAdd/
   MatMultHermitianTranspose/MatMultHermitianTransposeAdd for SeqAIJCUSPARSE.
   yy may be NULL (no addend) and yy may alias zz (in-place add). The herm && !trans
   combination is rejected up front. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* Empty matrix: the product is zero, so the result is just the addend (or zero) */
  if (!a->nz) {
    if (!yy) CHKERRQ(VecSet_SeqCUDA(zz,0));
    else CHKERRQ(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  /* Select the device mult structure: the matrix itself, or its explicitly formed transpose.
     For ^H, or when no explicit transpose is kept, cuSPARSE does the (conjugate) transpose op on A. */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    CHKERRQ(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) CHKERRQ(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else CHKERRQ(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    CHKERRQ(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[]; guard against enum ABI drift */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        CHKERRCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        CHKERRCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      CHKERRCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      CHKERRCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        CHKERRCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    CHKERRQ(PetscLogGpuTimeEnd());

    /* Post-processing: combine the SpMV result (possibly in the short work vector) with yy into zz */
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          CHKERRQ(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        CHKERRQ(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        CHKERRQ(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        CHKERRQ(PetscLogGpuTimeEnd());
      }
    } else {
      /* Transpose path: A^T x is full length and already in zz; just add yy unless it aliased zz (beta handled that) */
      if (yy && yy != zz) {
        CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    CHKERRQ(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) CHKERRQ(VecCUDARestoreArray(zz,&zarray));
    else CHKERRQ(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without an addend the first write per nonzero row is not an add */
  if (yy) {
    CHKERRQ(PetscLogGpuFlops(2.0*a->nz));
  } else {
    CHKERRQ(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3184 
/* Computes zz = A^T * xx + yy (transpose, no conjugation) via the fused mult-add kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3191 
/* Finish host-side assembly, then drop any cached device matrix whose sparsity pattern went stale */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state = A->nonzerostate; /* state before assembly, for change detection */

  PetscFunctionBegin;
  CHKERRQ(MatAssemblyEnd_SeqAIJ(A,mode));
  /* If assembly changed the nonzero pattern, a previously built device mirror no longer matches it */
  if (state != A->nonzerostate && cusp->deviceMat) {
    CHKERRQ(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    CHKERRCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3207 
3208 /* --------------------------------------------------------------------------------*/
3209 /*@
3210    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3212    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3213    assembly performance the user should preallocate the matrix storage by setting
3214    the parameter nz (or the array nnz).  By setting these parameters accurately,
3215    performance during matrix assembly can be increased by more than a factor of 50.
3216 
3217    Collective
3218 
3219    Input Parameters:
3220 +  comm - MPI communicator, set to PETSC_COMM_SELF
3221 .  m - number of rows
3222 .  n - number of columns
3223 .  nz - number of nonzeros per row (same for all rows)
3224 -  nnz - array containing the number of nonzeros in the various rows
3225          (possibly different for each row) or NULL
3226 
3227    Output Parameter:
3228 .  A - the matrix
3229 
3230    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3232    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3233 
3234    Notes:
3235    If nnz is given then nz is ignored
3236 
3237    The AIJ format (also called the Yale sparse matrix format or
3238    compressed row storage), is fully compatible with standard Fortran 77
3239    storage.  That is, the stored row and column indices can begin at
3240    either one (as in Fortran) or zero.  See the users' manual for details.
3241 
3242    Specify the preallocated storage with either nz or nnz (not both).
3243    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3244    allocation.  For large problems you MUST preallocate memory or you
3245    will get TERRIBLE performance, see the users' manual chapter on matrices.
3246 
3247    By default, this format uses inodes (identical nodes) when possible, to
3248    improve numerical efficiency of matrix-vector products and solves. We
3249    search for consecutive rows with the same nonzero structure, thereby
3250    reusing matrix information to achieve increased efficiency.
3251 
3252    Level: intermediate
3253 
3254 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3255 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create, size, and type the matrix, then preallocate with the caller's nz/nnz hints */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3265 
/* Tear down the GPU-side data in spptr, unregister all composed methods, then run the base AIJ destroy */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* Every method name composed onto this object type; each is detached (set to NULL) below */
  static const char *const composedOps[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };
  size_t i;

  PetscFunctionBegin;
  /* spptr holds different structures for factored vs unfactored matrices */
  if (A->factortype == MAT_FACTOR_NONE) {
    CHKERRQ(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    CHKERRQ(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  for (i=0; i<sizeof(composedOps)/sizeof(composedOps[0]); i++) {
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,composedOps[i],NULL));
  }
  CHKERRQ(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3287 
3288 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3289 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate on the host as SeqAIJ, then convert the copy in place to the CUSPARSE type */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3297 
/* Y = a*X + Y on the GPU. Dispatches among three paths:
   - fall back to the CPU MatAXPY_SeqAIJ when the two matrices do not share this axpy (one is bound to CPU)
     or when the structure flag requires general merging;
   - cusparse csrgeam (SUBSET_NONZERO_PATTERN);
   - a single cublas axpy over the value arrays (SAME_NONZERO_PATTERN, possibly detected at runtime). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* Mismatched axpy implementations means one operand is not GPU-resident; do the work on the CPU */
  if (X->ops->axpy != Y->ops->axpy) {
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(Y));
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: equal nonzero counts and identical
     row offsets + column indices mean the patterns are the same */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via csrgeam, writing in place into Y's value array */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* a and b are host pointers here, so switch pointer mode for the duration of the call */
    CHKERRCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace query + allocation for csrgeam */
    CHKERRCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    CHKERRCUDA(cudaMalloc(&buffer,bufferSize));
    CHKERRQ(PetscLogGpuTimeBegin());
    CHKERRCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    CHKERRQ(PetscLogGpuFlops(x->nz + y->nz));
    CHKERRQ(PetscLogGpuTimeEnd());
    CHKERRCUDA(cudaFree(buffer));
#else
    CHKERRQ(PetscLogGpuTimeBegin());
    CHKERRCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    CHKERRQ(PetscLogGpuFlops(x->nz + y->nz));
    CHKERRQ(PetscLogGpuTimeEnd());
#endif
    CHKERRCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* Identical patterns: the nonzero value arrays line up entry-for-entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle));
    CHKERRQ(PetscBLASIntCast(x->nz,&bnz));
    CHKERRQ(PetscLogGpuTimeBegin());
    CHKERRCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    CHKERRQ(PetscLogGpuFlops(2.0*bnz));
    CHKERRQ(PetscLogGpuTimeEnd());
    CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* DIFFERENT_NONZERO_PATTERN: pattern of Y may change, so fall back to the CPU implementation */
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
3391 
/* Scales every stored nonzero of Y by a, using a single cublas scal over the GPU value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, nnz = 1;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&vals));
  CHKERRQ(PetscCUBLASGetHandle(&handle));
  CHKERRQ(PetscBLASIntCast(aij->nz,&nnz));
  CHKERRQ(PetscLogGpuTimeBegin());
  CHKERRCUBLAS(cublasXscal(handle,nnz,&a,vals,inc));
  CHKERRQ(PetscLogGpuFlops(nnz));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&vals));
  /* Values changed, so any cached factored diagonal is stale */
  CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3411 
/* Zeros all stored values, on both the device copies (matrix and cached transpose, when present)
   and the host copy, then records where the up-to-date data lives via the offload mask */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij  = (Mat_SeqAIJ*)A->data;
  PetscBool  ongpu = PETSC_FALSE; /* set when the device values were zeroed too */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* Host copy is always zeroed */
  CHKERRQ(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3440 
/* Switches the matrix between CPU (flg = PETSC_TRUE) and GPU execution by rewiring the
   operation table and the composed methods. For factored matrices only the flag is recorded.
   When binding to CPU, the up-to-date values are first pulled back from the device. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* Make sure the host copy is current before CPU kernels start using it */
    CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* Clear the SeqAIJ-internal op table; MatSeqAIJGetArray_C below restores the plain getter */
    CHKERRQ(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    /* GPU path: install the CUSPARSE implementations and array accessors */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to CPU and available */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3503 
/* Converts a SeqAIJ matrix to SEQAIJCUSPARSE. For MAT_INITIAL_MATRIX a duplicate is made,
   for MAT_REUSE_MATRIX values are copied into *newmat, and for MAT_INPLACE_MATRIX the input
   itself is retyped. Allocates the CUSPARSE-specific spptr (mult struct or tri-factor struct,
   depending on factortype), sets CUDA as the default vector type, and installs the
   CUSPARSE operation table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    CHKERRQ(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    CHKERRQ(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* Vectors created to match this matrix should live on the GPU */
  CHKERRQ(PetscFree(B->defaultvectype));
  CHKERRQ(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* Regular matrix: allocate the mult structure and pick default cuSPARSE algorithms */
      Mat_SeqAIJCUSPARSE *spptr;
      CHKERRQ(PetscNew(&spptr));
      CHKERRCUSPARSE(cusparseCreate(&spptr->handle));
      CHKERRCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* Factored matrix: spptr holds triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      CHKERRQ(PetscNew(&spptr));
      CHKERRCUSPARSE(cusparseCreate(&spptr->handle));
      CHKERRCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  CHKERRQ(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  CHKERRQ(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3563 
/* Type constructor for MATSEQAIJCUSPARSE: build a MATSEQAIJ then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  CHKERRQ(MatCreate_SeqAIJ(B));
  CHKERRQ(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
3571 
3572 /*MC
3573    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3574 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3576    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3577    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3578 
3579    Options Database Keys:
3580 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3581 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3584 
3585   Level: beginner
3586 
3587 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3588 M*/
3589 
3590 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3591 
/* Register the cuSPARSE-based factorization back-ends with PETSc's solver-type registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const int           nftypes  = (int)(sizeof(ftypes)/sizeof(ftypes[0]));

  PetscFunctionBegin;
  /* the band LU solver works directly on MATSEQAIJ matrices */
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  /* the generic cuSPARSE solver supports LU, Cholesky, ILU and ICC on MATSEQAIJCUSPARSE */
  for (int f = 0; f < nftypes; f++) {
    CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[f],MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(0);
}
3603 
/* Release all device-side COO assembly state attached to mat, leaving the matrix
   ready for a fresh MatSetPreallocationCOO() call. Safe to call repeatedly. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    CHKERRCUDA(cudaFree(cusp->jmap_d));
    CHKERRCUDA(cudaFree(cusp->perm_d));
    /* null after free: MatSeqAIJCUSPARSE_Destroy() frees these again if non-NULL,
       which would be a double free after a reset */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3622 
/* Destroy the Mat_SeqAIJCUSPARSE context and every device resource it owns:
   the CSR mult structs (plain and transposed), thrust work arrays, the cuSPARSE
   handle, and the extended-COO device maps. Sets nothing; the caller owns *cusparsestruct. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) CHKERRCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) CHKERRCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) CHKERRCUDA(cudaFree((*cusparsestruct)->perm_d));
    CHKERRQ(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
3641 
/* Free a CsrMatrix (its three thrust device arrays and the struct itself) and
   zero the caller's pointer. A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = 0;
  PetscFunctionReturn(0);
}
3654 
/* Destroy one triangular-factor struct: its cuSPARSE descriptors/analysis info,
   the factor's CSR storage, and the device/pinned-host work buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) CHKERRCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) CHKERRCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
    CHKERRQ(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   CHKERRCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   CHKERRCUDA(cudaFreeHost((*trifactor)->AA_h)); /* allocated with cudaMallocHost, so freed with cudaFreeHost */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) CHKERRCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    CHKERRQ(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
3671 
/* Destroy a Mat_SeqAIJCUSPARSEMultStruct and all device resources it owns.
   format determines whether ->mat points to a CsrMatrix or a cusparseHybMat_t
   (the latter only for CUDA < 11, where ELL/HYB were still supported). */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        CHKERRCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CHKERRQ(CsrMatrix_Destroy(&mat)); /* fix: return code was previously ignored, unlike every other call in this file */
      }
    }
    if ((*matstruct)->descr) CHKERRCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) CHKERRCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) CHKERRCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  CHKERRCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic SpMV API resources: matrix descriptor plus per-operation buffers/vector descriptors */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) CHKERRCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        CHKERRCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3713 
/* Reset (but do not free) a TriFactors container: destroy the four triangular
   factors, the permutation index arrays, the work vector and the band-factor
   device buffers. The container and its cuSPARSE handle survive, so the factors
   can be rebuilt. Safe to call repeatedly. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   CHKERRCUDA(cudaFree((*trifactors)->a_band_d));
    if ((*trifactors)->i_band_d)   CHKERRCUDA(cudaFree((*trifactors)->i_band_d));
    /* fix: null after free so a subsequent Reset (or Destroy, which calls Reset)
       does not attempt to free the same device memory twice */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3734 
/* Fully destroy a TriFactors container: Reset() the factors, destroy the
   cuSPARSE handle, then free the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    /* grab the handle before Reset(); avoid assignment-inside-condition, which
       is easy to misread as '==' and draws compiler warnings */
    cusparseHandle_t handle = (*trifactors)->handle;

    CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if (handle) CHKERRCUSPARSE(cusparseDestroy(handle));
    CHKERRQ(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3749 
/* Strict weak ordering on (row,col) tuples: order by row, break ties by column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 != r2) ? (r1 < r2) : (t1.get<1>() < t2.get<1>());
  }
};
3760 
/* Equality predicate on (row,col) tuples: true iff both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3770 
/* Binary op for adjacent_difference: 1 where consecutive values differ, else 0 */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3779 
/* Logical-or of two 0/1 flags, returned as a PetscInt (0 or 1) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3788 
3789 #include <thrust/iterator/discard_iterator.h>
3790 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Insert or add the values v[] into A at the COO locations recorded by
   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() (the permutation cusp->cooPerm,
   plus cusp->cooPerm_a when the COO input contained repeated entries).
   v may live on host or device; v == NULL with INSERT_VALUES zeros the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;   /* device copy of v[] when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation recorded: just finish assembly and return */
    CHKERRQ(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    CHKERRQ(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host input: stage it on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  CHKERRQ(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  CHKERRQ(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device */
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  CHKERRQ(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  CHKERRQ(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  CHKERRQ(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3871 
/* Mark the cached device transpose of A as stale; with destroy == PETSC_TRUE the
   transpose mult struct and the cached csr2csc index array are freed as well. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3887 
3888 #include <thrust/binary_search.h>
3889 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A from n COO (coo_i[],coo_j[]) pairs on the device,
   recording in cusp->cooPerm the permutation that maps user COO order to CSR order
   (and in cusp->cooPerm_a a map to unique nonzeros when the input has repeats).
   Precondition: coo_i[]/coo_j[] contain no negative indices. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;  /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  CHKERRQ(PetscLayoutSetUp(A->rmap));
  CHKERRQ(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: drop any previously recorded permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row offsets (without the leading 0) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    CHKERRQ(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    CHKERRQ(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of the Mat_SeqAIJ with ones matching the device layout */
    CHKERRQ(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    CHKERRQ(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    CHKERRCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    CHKERRQ(PetscMalloc1(a->nz,&a->a));
    CHKERRQ(PetscMalloc1(a->nz,&a->j));
    CHKERRCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) CHKERRQ(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) CHKERRQ(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    CHKERRQ(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    CHKERRQ(MatMarkDiagonal_SeqAIJ(A));
  } else {
    CHKERRQ(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  CHKERRQ(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  CHKERRQ(PetscArrayzero(a->a,a->nz));
  CHKERRQ(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
4001 
/* COO preallocation dispatcher: uses the fast 'Basic' device path when the indices
   are on the device or contain no negative entries; otherwise falls back to the
   host SeqAIJ COO code and mirrors its jmap/perm arrays onto the device
   (the "extended" COO path, which supports negative = ignored indices). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* throw away any state from a previous COO preallocation */
  CHKERRQ(MatResetPreallocationCOO_SeqAIJ(mat));
  CHKERRQ(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    CHKERRQ(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* only host arrays can be scanned for negative (i.e. to-be-ignored) indices */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    CHKERRQ(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* copy the host-computed COO maps (jmap: per-nonzero ranges; perm: permutation) to the device */
    CHKERRCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    CHKERRCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4037 
/* Kernel: finalize COO assembly. For each nonzero i of the assembled matrix, sum the
   user contributions kv[perm[k]] for k in [jmap[i], jmap[i+1]) and insert (INSERT_VALUES)
   or add (otherwise) the sum into a[i]. Launched with a 1-D grid; the grid-stride
   loop makes any launch configuration cover all nnz entries. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* fix: do the index arithmetic in (64-bit) PetscCount; blockIdx.x*blockDim.x and
     gridDim.x*blockDim.x are unsigned 32-bit products that can silently overflow
     on very large launches before being widened */
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4048 
/* Set (or add) the COO values v[] into A on the device. On the extended-COO path the
   MatAddCOOValues kernel consumes the device maps built at preallocation time;
   otherwise the thrust-based 'Basic' path is used.
   NOTE(review): the extended path dereferences v unconditionally (PetscGetMemType,
   cudaMemcpy) -- it assumes v != NULL, unlike the Basic path which handles NULL; confirm
   callers never pass NULL here. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;   /* device-resident view of v[] */
  PetscScalar         *Aa;
  
  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    CHKERRQ(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      CHKERRCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites everything, so write-only access suffices */
    if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else CHKERRQ(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      CHKERRCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing them */
    }

    if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else CHKERRQ(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) CHKERRCUDA(cudaFree((void*)v1)); /* free the staging copy */
  } else {
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}
4083 
4084 /*@C
4085     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4086 
4087    Not collective
4088 
4089     Input Parameters:
4090 +   A - the matrix
4091 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4092 
4093     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4096 
4097     Level: developer
4098 
4099     Notes:
4100       When compressed is true, the CSR structure does not contain empty rows
4101 
4102 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4103 @*/
/* Return device pointers to the CSR row offsets (*i) and column indices (*j) of A,
   copying the matrix to the GPU first if the up-to-date data is on the host.
   With compressed == PETSC_FALSE and a compressed-row matrix, a full (uncompressed)
   row-offset array is built on demand and cached in cusp->rowoffsets_gpu. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): this returns early if EITHER output pointer is NULL, which makes
     the individual "if (i)"/"if (j)" guards below dead code for NULL arguments;
     possibly "(!i && !j)" was intended -- confirm before changing */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build and cache the uncompressed row offsets on the device from the host CSR */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4131 
4132 /*@C
4133     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4134 
4135    Not collective
4136 
4137     Input Parameters:
4138 +   A - the matrix
4139 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4140 
4141     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4144 
4145     Level: developer
4146 
4147 .seealso: MatSeqAIJCUSPARSEGetIJ()
4148 @*/
/* Counterpart of MatSeqAIJCUSPARSEGetIJ(): there is no device state to release,
   so just invalidate the caller's pointers to discourage use after restore. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(0);
}
4158 
4159 /*@C
4160    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4161 
4162    Not Collective
4163 
4164    Input Parameter:
4165 .   A - a MATSEQAIJCUSPARSE matrix
4166 
4167    Output Parameter:
4168 .   a - pointer to the device data
4169 
4170    Level: developer
4171 
4172    Notes: may trigger host-device copies if up-to-date matrix data is on host
4173 
4174 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4175 @*/
/* Hand out a read-only device pointer to the CSR values of A, copying the matrix
   to the GPU first if the up-to-date data is on the host. Read-only access does
   not change the offload mask or invalidate the cached transpose. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy */
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
4193 
4194 /*@C
4195    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4196 
4197    Not Collective
4198 
4199    Input Parameter:
4200 .   A - a MATSEQAIJCUSPARSE matrix
4201 
4202    Output Parameter:
4203 .   a - pointer to the device data
4204 
4205    Level: developer
4206 
4207 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4208 @*/
/* Counterpart of MatSeqAIJCUSPARSEGetArrayRead(): the access was read-only, so no
   state updates are required; only the caller's pointer is invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* discourage use after restore */
  PetscFunctionReturn(0);
}
4218 
4219 /*@C
4220    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4221 
4222    Not Collective
4223 
4224    Input Parameter:
4225 .   A - a MATSEQAIJCUSPARSE matrix
4226 
4227    Output Parameter:
4228 .   a - pointer to the device data
4229 
4230    Level: developer
4231 
4232    Notes: may trigger host-device copies if up-to-date matrix data is on host
4233 
4234 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4235 @*/
/* Hand out a mutable device pointer to the CSR values of A. Because the caller may
   write through it, the device copy becomes the authoritative one and any cached
   transpose is invalidated (its structure is kept, only marked stale). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy */
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write: device copy is now master */
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data (set to NULL on return)

   Level: developer

   Notes: since values may have been modified, the cached diagonal information is
          invalidated and the object state of A is increased

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));           /* values may have changed */
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A)); /* signal modification to dependents */
  *a = NULL;
  PetscFunctionReturn(0);
}
4281 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU.
          Unlike MatSeqAIJCUSPARSEGetArray(), the current host values are NOT copied
          to the device since the caller promises to overwrite them.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a single contiguous values array */
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller will write: GPU copy becomes the only valid one */
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* cached transpose values no longer trusted */
  PetscFunctionReturn(0);
}
4317 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data (set to NULL on return)

   Level: developer

   Notes: since values have been overwritten, the cached diagonal information is
          invalidated and the object state of A is increased

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));           /* values have changed */
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A)); /* signal modification to dependents */
  *a = NULL;
  PetscFunctionReturn(0);
}
4344 
/* Strict weak ordering for COO 4-tuples (row, col, value, flag):
   lexicographic by row, then by column; the value and flag entries
   do not participate in the comparison. Used to merge two sorted
   coordinate streams while keeping rows and columns ordered. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>();
    const int r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;      /* rows differ: order by row */
    return t1.get<1>() < t2.get<1>();  /* same row: order by column */
  }
};
4355 
/* Functor that adds a fixed integer offset to its argument; used with
   thrust::make_transform_iterator to shift column indices/row offsets
   when concatenating two CSR matrices. */
struct Shift
{
  int _delta; /* offset added to every input value */

  Shift(int shift) : _delta(shift) {}
  __host__ __device__
  inline int operator() (const int &c) { return _delta + c; }
};
4367 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation

   A and B must have the same number of rows; the result C has A->cmap->n + B->cmap->n columns.
   With MAT_INITIAL_MATRIX the full structure (and, when both inputs carry one, the explicit
   transpose) is built on the GPU; with MAT_REUSE_MATRIX only the numerical values are refreshed
   through the permutation cached in cooPerm. MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* build C's header and GPU multiply structure from scratch */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    CHKERRQ(MatCreate(PETSC_COMM_SELF,C));
    CHKERRQ(MatSetSizes(*C,m,n,m,n));
    CHKERRQ(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); /* records where each A/B entry lands in C, for MAT_REUSE_MATRIX */
    if (c->nz) {
      /* convert A and B to COO, merge the two streams sorted by (row,col), then convert back to CSR */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      CHKERRQ(PetscLogGpuTimeBegin());
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* wPerm now marks which merged slots came from A (1) vs B (0); split it into
         the destination indices stored in cooPerm: A's targets first, then B's */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      CHKERRQ(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C^T is [A^T B^T]: its CSR arrays are the row-wise concatenation of A^T and B^T */
        CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        CHKERRQ(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* B^T's first offset overwrites A^T's last (both equal a->nz after shifting) */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        CHKERRQ(PetscLogGpuTimeEnd());

        CHKERRCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        CHKERRCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        CHKERRCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        CHKERRCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's CSR structure on the host so the SeqAIJ part of C is consistent */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    CHKERRQ(PetscMalloc1(m+1,&c->i));
    CHKERRQ(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      CHKERRCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      CHKERRCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    CHKERRQ(PetscMalloc1(m,&c->ilen));
    CHKERRQ(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    CHKERRQ(MatMarkDiagonal_SeqAIJ(*C));
    CHKERRQ(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    CHKERRQ(PetscLayoutSetUp((*C)->rmap));
    CHKERRQ(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: scatter the current values of A and B into C through cooPerm */
    PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      CHKERRQ(PetscLogGpuTimeBegin());
      /* cooPerm[0..Annz) holds C positions of A's entries, cooPerm[Annz..) those of B's */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      CHKERRQ(PetscLogGpuTimeEnd());
    }
  }
  CHKERRQ(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4669 
/* Copies n entries of the (device-resident) value array of A into v.
   If idx is given, entry k of v receives value idx[k] of A; otherwise the first
   n values are copied verbatim. v may live in host or device memory (detected
   via isCudaMem); host destinations incur a device-to-host transfer. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the index set and gather on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* device staging buffer for a host destination */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      CHKERRCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    CHKERRCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination means data moved device->host: log the GPU-to-CPU transfer
     (the original code logged the wrong direction with PetscLogCpuToGpu) */
  if (!dmem) CHKERRQ(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4707