xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision b122ec5aa1bd4469eb4e0673542fb7de3f411254)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
22 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; /* value names, then enum type name, option prefix, and NULL sentinel, as PetscOptionsEnum() expects */
23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26 
27   typedef enum {
28       CUSPARSE_MV_ALG_DEFAULT = 0,
29       CUSPARSE_COOMV_ALG      = 1,
30       CUSPARSE_CSRMV_ALG1     = 2,
31       CUSPARSE_CSRMV_ALG2     = 3
32   } cusparseSpMVAlg_t;
33 
34   typedef enum {
35       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41       CUSPARSE_SPMM_COO_ALG1    = 1,
42       CUSPARSE_SPMM_COO_ALG2    = 2,
43       CUSPARSE_SPMM_COO_ALG3    = 3,
44       CUSPARSE_SPMM_COO_ALG4    = 5,
45       CUSPARSE_SPMM_CSR_ALG1    = 4,
46       CUSPARSE_SPMM_CSR_ALG2    = 6,
47   } cusparseSpMMAlg_t;
48 
49   typedef enum {
50       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
51       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
52   } cusparseCsr2CscAlg_t;
53   */
  /* Entries are listed in 0-based cuSPARSE enum-value order (see the quoted enums above) so that
     PetscOptionsEnum(), which maps an option string to its array position, yields the matching cuSPARSE value. */
54   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57 #endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/* MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU-side data (A->spptr)
   and makes the matrix's cuSPARSE handle issue all subsequent work on that stream.
   Errors with PETSC_ERR_COR if A->spptr has not been allocated yet. */
95 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
96 {
97   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98 
99   PetscFunctionBegin;
100   PetscCheck(cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101   cusparsestruct->stream = stream;
102   CHKERRCUSPARSE(cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream)); /* bind the handle to the stored stream */
103   PetscFunctionReturn(0);
104 }
105 
/* MatCUSPARSESetHandle - installs a caller-provided cuSPARSE handle on the matrix,
   destroying any previously owned handle first, and switches the handle to device
   pointer mode. Errors with PETSC_ERR_COR if A->spptr has not been allocated yet. */
106 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
107 {
108   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
109 
110   PetscFunctionBegin;
111   PetscCheck(cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
112   if (cusparsestruct->handle != handle) {
113     if (cusparsestruct->handle) {
114       CHKERRCUSPARSE(cusparseDestroy(cusparsestruct->handle)); /* release the handle we previously held before adopting the new one */
115     }
116     cusparsestruct->handle = handle;
117   }
118   CHKERRCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); /* scalar arguments (alpha/beta) will be passed as device pointers */
119   PetscFunctionReturn(0);
120 }
121 
/* MatCUSPARSEClearHandle - forgets the cuSPARSE handle stored on the matrix without calling
   cusparseDestroy() on it; ownership presumably remains with whoever installed it via
   MatCUSPARSESetHandle() -- TODO confirm. No-op (success) if A is not a MATSEQAIJCUSPARSE
   matrix or has no GPU-side data. */
122 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
123 {
124   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
125   PetscBool          flg;
126 
127   PetscFunctionBegin;
128   CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
129   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
130   if (cusparsestruct->handle) cusparsestruct->handle = 0; /* drop the reference only; the handle itself is not destroyed here */
131   PetscFunctionReturn(0);
132 }
133 
/* MatFactorGetSolverType_seqaij_cusparse - query callback composed on factor matrices
   created by MatGetFactor_seqaijcusparse_cusparse(); always reports MATSOLVERCUSPARSE. */
134 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
135 {
136   PetscFunctionBegin;
137   *type = MATSOLVERCUSPARSE;
138   PetscFunctionReturn(0);
139 }
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
144   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/* MatGetFactor_seqaijcusparse_cusparse - creates an (empty) n x n MATSEQAIJCUSPARSE factor
   matrix *B for A and wires up the symbolic-factorization function pointers for the requested
   factor type (LU/ILU/ILUDT or Cholesky/ICC). When A is bound to the CPU the plain SeqAIJ
   symbolic routines are installed instead of the CUSPARSE ones. Errors with PETSC_ERR_SUP
   for any other factor type. */
154 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
155 {
156   PetscInt       n = A->rmap->n;
157 
158   PetscFunctionBegin;
159   CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B));
160   CHKERRQ(MatSetSizes(*B,n,n,n,n));
161   (*B)->factortype = ftype;
162   CHKERRQ(MatSetType(*B,MATSEQAIJCUSPARSE));
163 
164   if (A->boundtocpu && A->bindingpropagates) CHKERRQ(MatBindToCPU(*B,PETSC_TRUE)); /* propagate the CPU binding from A onto the factor */
165   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
166     CHKERRQ(MatSetBlockSizesFromMats(*B,A,A));
167     if (!A->boundtocpu) {
168       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
169       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
170     } else {
171       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
172       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
173     }
174     CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
175     CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
176     CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
177   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
178     if (!A->boundtocpu) {
179       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
180       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
181     } else {
182       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
183       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
184     }
185     CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
186     CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
187   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
188 
189   CHKERRQ(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); /* type is set but storage is deferred until the symbolic phase */
190   (*B)->canuseordering = PETSC_TRUE;
191   CHKERRQ(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
192   PetscFunctionReturn(0);
193 }
194 
/* MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind MatCUSPARSESetFormat().
   For a sequential matrix there is only one storage-format field, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL currently have the same effect; any other operation is an error. */
195 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
196 {
197   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
198 
199   PetscFunctionBegin;
200   switch (op) {
201   case MAT_CUSPARSE_MULT:
202     cusparsestruct->format = format;
203     break;
204   case MAT_CUSPARSE_ALL:
205     cusparsestruct->format = format; /* same field as MAT_CUSPARSE_MULT: seq matrices keep a single format */
206     break;
207   default:
208     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
209   }
210   PetscFunctionReturn(0);
211 }
212 
213 /*@
214    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
215    operation. Only the MatMult operation can use different GPU storage formats
216    for MPIAIJCUSPARSE matrices.
217    Not Collective
218 
219    Input Parameters:
220 +  A - Matrix of type SEQAIJCUSPARSE
221 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
222 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
223 
224    Output Parameter:
225 
226    Level: intermediate
227 
228 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
229 @*/
230 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
231 {
232   PetscFunctionBegin;
233   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
234   CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format))); /* dispatch to the type-specific implementation; silently a no-op for types that do not compose it */
235   PetscFunctionReturn(0);
236 }
237 
/* MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetUseCPUSolve(); stores the flag consulted by MatLUFactorNumeric_SeqAIJCUSPARSE
   when choosing between CPU and GPU triangular solves.
   NOTE(review): unlike MatCUSPARSESetStream/SetHandle, there is no PetscCheck that A->spptr
   is non-NULL before dereferencing -- confirm callers guarantee it. */
238 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
239 {
240   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
241 
242   PetscFunctionBegin;
243   cusparsestruct->use_cpu_solve = use_cpu;
244   PetscFunctionReturn(0);
245 }
246 
247 /*@
248    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
249 
250    Input Parameters:
251 +  A - Matrix of type SEQAIJCUSPARSE
252 -  use_cpu - set flag for using the built-in CPU MatSolve
253 
254    Output Parameter:
255 
256    Notes:
257    The cuSparse LU solver currently computes the factors with the built-in CPU method
258    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
259    This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
260 
261    Level: intermediate
262 
263 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
264 @*/
265 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
266 {
267   PetscFunctionBegin;
268   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
269   CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu))); /* dispatch to the type-specific implementation; a no-op for types that do not compose it */
270   PetscFunctionReturn(0);
271 }
272 
/* MatSetOption_SeqAIJCUSPARSE - intercepts MAT_FORM_EXPLICIT_TRANSPOSE to keep the cached GPU
   transpose consistent with the flag; every other option is forwarded to the SeqAIJ handler. */
273 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
274 {
275   PetscFunctionBegin;
276   switch (op) {
277     case MAT_FORM_EXPLICIT_TRANSPOSE:
278       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
279       if (A->form_explicit_transpose && !flg) CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
280       A->form_explicit_transpose = flg;
281       break;
282     default:
283       CHKERRQ(MatSetOption_SeqAIJ(A,op,flg)); /* delegate everything else to the CPU SeqAIJ implementation */
284       break;
285   }
286   PetscFunctionReturn(0);
287 }
288 
289 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
290 
/* MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization for the CUSPARSE type.
   The factorization itself is computed on the CPU by MatLUFactorNumeric_SeqAIJ (A is first
   synchronized from the GPU); afterwards the MatSolve/MatSolveTranspose function pointers are
   selected (natural-ordering fast path when both row and column permutations are identities)
   and, unless CPU solves were requested, the triangular factors are analyzed and copied to
   the GPU. */
291 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
292 {
293   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
294   IS             isrow = b->row,iscol = b->col;
295   PetscBool      row_identity,col_identity;
296   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; /* NOTE(review): the symbolic routines cast B->spptr to Mat_SeqAIJCUSPARSETriFactors -- confirm this alternate cast reads use_cpu_solve from the intended struct */
297 
298   PetscFunctionBegin;
299   CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A)); /* ensure host values of A are current before the CPU factorization */
300   CHKERRQ(MatLUFactorNumeric_SeqAIJ(B,A,info));
301   B->offloadmask = PETSC_OFFLOAD_CPU; /* the freshly computed factors live on the host */
302   /* determine which version of MatSolve needs to be used. */
303   CHKERRQ(ISIdentity(isrow,&row_identity));
304   CHKERRQ(ISIdentity(iscol,&col_identity));
305   if (row_identity && col_identity) {
306     if (!cusparsestruct->use_cpu_solve) {
307       B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
308       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
309     }
310     B->ops->matsolve = NULL;
311     B->ops->matsolvetranspose = NULL;
312   } else {
313     if (!cusparsestruct->use_cpu_solve) {
314       B->ops->solve = MatSolve_SeqAIJCUSPARSE;
315       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
316     }
317     B->ops->matsolve = NULL;
318     B->ops->matsolvetranspose = NULL;
319   }
320 
321   /* get the triangular factors */
322   if (!cusparsestruct->use_cpu_solve) {
323     CHKERRQ(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
324   }
325   PetscFunctionReturn(0);
326 }
327 
/* MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* command-line options
   (storage format, CPU-solve flag, and -- for CUDA >= 11 -- the SpMV/SpMM/csr2csc algorithm
   choices). All of these options apply only to unfactored matrices (MAT_FACTOR_NONE). The
   PetscCheckFalse guards verify that the positional values PetscOptionsEnum() produced still
   match the numeric values of the corresponding cuSPARSE enums. */
328 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
329 {
330   PetscErrorCode           ierr;
331   MatCUSPARSEStorageFormat format;
332   PetscBool                flg;
333   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
334 
335   PetscFunctionBegin;
336   CHKERRQ(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options"));
337   if (A->factortype == MAT_FACTOR_NONE) {
338     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
339                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
340     if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));
341 
342     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
343                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
344     if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
345     CHKERRQ(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
346     if (flg) CHKERRQ(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
347 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
348     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
349                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
350     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
351 #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
352     PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
353 #else
354     PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
355 #endif
356     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
357                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
358     PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
359 
360     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
361                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
362     PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
363    #endif
364   }
365   CHKERRQ(PetscOptionsTail());
366   PetscFunctionReturn(0);
367 }
368 
/* MatILUFactorSymbolic_SeqAIJCUSPARSE - resets any stale GPU triangular-factor data, performs
   the symbolic ILU on the CPU via the SeqAIJ routine, and installs the CUSPARSE numeric stage. */
369 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
370 {
371   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
372 
373   PetscFunctionBegin;
374   CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); /* discard GPU factors from any previous factorization */
375   CHKERRQ(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
376   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
377   PetscFunctionReturn(0);
378 }
379 
/* MatLUFactorSymbolic_SeqAIJCUSPARSE - resets any stale GPU triangular-factor data, performs
   the symbolic LU on the CPU via the SeqAIJ routine, and installs the CUSPARSE numeric stage. */
380 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
381 {
382   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
383 
384   PetscFunctionBegin;
385   CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); /* discard GPU factors from any previous factorization */
386   CHKERRQ(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
387   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
388   PetscFunctionReturn(0);
389 }
390 
/* MatICCFactorSymbolic_SeqAIJCUSPARSE - resets any stale GPU triangular-factor data, performs
   the symbolic ICC on the CPU via the SeqAIJ routine, and installs the CUSPARSE numeric stage. */
391 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
392 {
393   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
394 
395   PetscFunctionBegin;
396   CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); /* discard GPU factors from any previous factorization */
397   CHKERRQ(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
398   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
399   PetscFunctionReturn(0);
400 }
401 
/* MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - resets any stale GPU triangular-factor data,
   performs the symbolic Cholesky on the CPU via the SeqAIJ routine, and installs the CUSPARSE
   numeric stage. */
402 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
403 {
404   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
405 
406   PetscFunctionBegin;
407   CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); /* discard GPU factors from any previous factorization */
408   CHKERRQ(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
409   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
410   PetscFunctionReturn(0);
411 }
412 
/* MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds (or, on refactorization, value-updates) the
   unit-lower-triangular factor L of A on the GPU for cuSPARSE triangular solves.
   Assumes the factored-SeqAIJ layout in which a->i/a->j/a->a hold the strictly lower
   triangular rows (row 0 therefore contributes only its implicit unit diagonal) --
   NOTE(review): confirm against MatLUFactorNumeric_SeqAIJ's storage convention.
   First call: assembles a host CSR copy (in pinned memory), creates the cuSPARSE matrix
   descriptor and solve-analysis info, and uploads everything. Subsequent calls with an
   existing loTriFactor refill only the values via the cached host buffer AA_h. */
413 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
414 {
415   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
416   PetscInt                          n = A->rmap->n;
417   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
418   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
419   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
420   const MatScalar                   *aa = a->a,*v;
421   PetscInt                          *AiLo, *AjLo;
422   PetscInt                          i,nz, nzLower, offset, rowOffset;
423 
424   PetscFunctionBegin;
425   if (!n) PetscFunctionReturn(0);
426   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
427     try {
428       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
429       nzLower=n+ai[n]-ai[1]; /* n unit-diagonal entries plus the off-diagonal entries of rows 1..n-1 */
430       if (!loTriFactor) {
431         PetscScalar                       *AALo;
432 
433         CHKERRCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); /* pinned host memory for fast async-capable transfers */
434 
435         /* Allocate Space for the lower triangular matrix */
436         CHKERRCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
437         CHKERRCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));
438 
439         /* Fill the lower triangular matrix */
440         AiLo[0]  = (PetscInt) 0;
441         AiLo[n]  = nzLower;
442         AjLo[0]  = (PetscInt) 0;
443         AALo[0]  = (MatScalar) 1.0; /* row 0 holds only its unit diagonal */
444         v        = aa;
445         vi       = aj;
446         offset   = 1;
447         rowOffset= 1;
448         for (i=1; i<n; i++) {
449           nz = ai[i+1] - ai[i];
450           /* additional 1 for the term on the diagonal */
451           AiLo[i]    = rowOffset;
452           rowOffset += nz+1;
453 
454           CHKERRQ(PetscArraycpy(&(AjLo[offset]), vi, nz)); /* copy the strictly-lower entries of row i ... */
455           CHKERRQ(PetscArraycpy(&(AALo[offset]), v, nz));
456 
457           offset      += nz;
458           AjLo[offset] = (PetscInt) i; /* ... then append the unit diagonal */
459           AALo[offset] = (MatScalar) 1.0;
460           offset      += 1;
461 
462           v  += nz;
463           vi += nz;
464         }
465 
466         /* allocate space for the triangular factor information */
467         CHKERRQ(PetscNew(&loTriFactor));
468         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
469         /* Create the matrix description */
470         CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
471         CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
472        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
473         CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
474        #else
475         CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
476        #endif
477         CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
478         CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); /* the stored 1.0 diagonal is implicit for the solver */
479 
480         /* set the operation */
481         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
482 
483         /* set the matrix */
484         loTriFactor->csrMat = new CsrMatrix;
485         loTriFactor->csrMat->num_rows = n;
486         loTriFactor->csrMat->num_cols = n;
487         loTriFactor->csrMat->num_entries = nzLower;
488 
489         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
490         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
491 
492         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
493         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
494 
495         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
496         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
497 
498         /* Create the solve analysis information */
499         CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
500         CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
501       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
502         CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
503                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
504                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
505                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
506                                                &loTriFactor->solveBufferSize));
507         CHKERRCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); /* scratch buffer required by the csrsv2-style analysis/solve API */
508       #endif
509 
510         /* perform the solve analysis */
511         CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
512                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
513                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
514                                          loTriFactor->csrMat->column_indices->data().get(),
515                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
516                                          loTriFactor->solveInfo,
517                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
518                                          #else
519                                          loTriFactor->solveInfo));
520                                          #endif
521         CHKERRCUDA(WaitForCUDA());
522         CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
523 
524         /* assign the pointer */
525         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
526         loTriFactor->AA_h = AALo; /* keep the pinned value buffer for later value-only updates */
527         CHKERRCUDA(cudaFreeHost(AiLo));
528         CHKERRCUDA(cudaFreeHost(AjLo));
529         CHKERRQ(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
530       } else { /* update values only */
531         if (!loTriFactor->AA_h) {
532           CHKERRCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
533         }
534         /* Fill the lower triangular matrix */
535         loTriFactor->AA_h[0]  = 1.0;
536         v        = aa;
537         vi       = aj;
538         offset   = 1;
539         for (i=1; i<n; i++) {
540           nz = ai[i+1] - ai[i];
541           CHKERRQ(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
542           offset      += nz;
543           loTriFactor->AA_h[offset] = 1.0; /* re-append the unit diagonal, mirroring the first-build layout */
544           offset      += 1;
545           v  += nz;
546         }
547         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
548         CHKERRQ(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
549       }
550     } catch(char *ex) {
551       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
552     }
553   }
554   PetscFunctionReturn(0);
555 }
556 
557 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
558 {
559   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
560   PetscInt                          n = A->rmap->n;
561   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
562   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
563   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
564   const MatScalar                   *aa = a->a,*v;
565   PetscInt                          *AiUp, *AjUp;
566   PetscInt                          i,nz, nzUpper, offset;
567 
568   PetscFunctionBegin;
569   if (!n) PetscFunctionReturn(0);
570   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
571     try {
572       /* next, figure out the number of nonzeros in the upper triangular matrix. */
573       nzUpper = adiag[0]-adiag[n];
574       if (!upTriFactor) {
575         PetscScalar *AAUp;
576 
577         CHKERRCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
578 
579         /* Allocate Space for the upper triangular matrix */
580         CHKERRCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
581         CHKERRCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
582 
583         /* Fill the upper triangular matrix */
584         AiUp[0]=(PetscInt) 0;
585         AiUp[n]=nzUpper;
586         offset = nzUpper;
587         for (i=n-1; i>=0; i--) {
588           v  = aa + adiag[i+1] + 1;
589           vi = aj + adiag[i+1] + 1;
590 
591           /* number of elements NOT on the diagonal */
592           nz = adiag[i] - adiag[i+1]-1;
593 
594           /* decrement the offset */
595           offset -= (nz+1);
596 
597           /* first, set the diagonal elements */
598           AjUp[offset] = (PetscInt) i;
599           AAUp[offset] = (MatScalar)1./v[nz];
600           AiUp[i]      = AiUp[i+1] - (nz+1);
601 
602           CHKERRQ(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
603           CHKERRQ(PetscArraycpy(&(AAUp[offset+1]), v, nz));
604         }
605 
606         /* allocate space for the triangular factor information */
607         CHKERRQ(PetscNew(&upTriFactor));
608         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
609 
610         /* Create the matrix description */
611         CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
612         CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
613        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614         CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
615        #else
616         CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
617        #endif
618         CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
619         CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
620 
621         /* set the operation */
622         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
623 
624         /* set the matrix */
625         upTriFactor->csrMat = new CsrMatrix;
626         upTriFactor->csrMat->num_rows = n;
627         upTriFactor->csrMat->num_cols = n;
628         upTriFactor->csrMat->num_entries = nzUpper;
629 
630         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
631         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
632 
633         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
634         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
635 
636         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
637         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
638 
639         /* Create the solve analysis information */
640         CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
641         CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
642       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
643         CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
644                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
645                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
646                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
647                                                &upTriFactor->solveBufferSize));
648         CHKERRCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
649       #endif
650 
651         /* perform the solve analysis */
652         CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
653                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
654                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
655                                          upTriFactor->csrMat->column_indices->data().get(),
656                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
657                                          upTriFactor->solveInfo,
658                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
659                                          #else
660                                          upTriFactor->solveInfo));
661                                          #endif
662         CHKERRCUDA(WaitForCUDA());
663         CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
664 
665         /* assign the pointer */
666         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
667         upTriFactor->AA_h = AAUp;
668         CHKERRCUDA(cudaFreeHost(AiUp));
669         CHKERRCUDA(cudaFreeHost(AjUp));
670         CHKERRQ(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
671       } else {
672         if (!upTriFactor->AA_h) {
673           CHKERRCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
674         }
675         /* Fill the upper triangular matrix */
676         offset = nzUpper;
677         for (i=n-1; i>=0; i--) {
678           v  = aa + adiag[i+1] + 1;
679 
680           /* number of elements NOT on the diagonal */
681           nz = adiag[i] - adiag[i+1]-1;
682 
683           /* decrement the offset */
684           offset -= (nz+1);
685 
686           /* first, set the diagonal elements */
687           upTriFactor->AA_h[offset] = 1./v[nz];
688           CHKERRQ(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
689         }
690         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
691         CHKERRQ(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
692       }
693     } catch(char *ex) {
694       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
695     }
696   }
697   PetscFunctionReturn(0);
698 }
699 
/* Ensure the device-side ILU triangular factors exist and are current, then
   cache the solve work vector and, when the row/column orderings are not the
   identity, the permutation index arrays on the device. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis    = aij->row;
  IS                           colis    = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* (re)build both triangular factors on the device */
  CHKERRQ(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  CHKERRQ(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular (row) permutation: copied to the device only once and
     only when it is nontrivial */
  CHKERRQ(ISIdentity(rowis,&rowIsIdentity));
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    CHKERRQ(ISGetIndices(rowis,&ridx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx,ridx+nrows);
    CHKERRQ(ISRestoreIndices(rowis,&ridx));
    CHKERRQ(PetscLogCpuToGpu(nrows*sizeof(PetscInt)));
  }

  /* upper triangular (column) permutation: same one-shot treatment */
  CHKERRQ(ISIdentity(colis,&colIsIdentity));
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    CHKERRQ(ISGetIndices(colis,&cidx));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx,cidx+nrows);
    CHKERRQ(ISRestoreIndices(colis,&cidx));
    CHKERRQ(PetscLogCpuToGpu(nrows*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
742 
/*
  Build (or refresh) the device-side triangular factors for an ICC (incomplete
  Cholesky) factorization.

  On the first call (neither factor struct exists) this allocates pinned host
  staging arrays, assembles both the upper factor and the scaled array used for
  the lower solve from the host factor data, creates the cusparse matrix
  descriptors and CSR storage on the device, and runs the triangular-solve
  analysis for both factors.  On subsequent calls only the numerical values are
  recomputed and re-uploaded; the structure and analysis info are reused.
  No-op when the matrix is empty or the device already holds current data.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (for nz) and as
     Mat_SeqSBAIJ (for the i/j/a arrays) -- the ICC factor is stored in the
     SBAIJ (upper-triangular) layout.  Confirm this aliasing is intentional
     before changing either cast. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)copy when the freshest data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper-factor and lower-factor values */
      CHKERRCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        CHKERRCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        CHKERRCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers; v[nz] is the (last-stored) diagonal entry of row i */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (stored as the inverse diagonal) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonal entries: negated for the upper factor and
               additionally scaled by the inverse diagonal for the lower
               (transposed) solve */
            CHKERRQ(PetscArraycpy(&(AjUp[offset]), vj, nz));
            CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        CHKERRQ(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        /* NOTE(review): declared unit-diagonal, so cusparse ignores the stored
           1/d diagonal values during the upper solve (the diagonal scaling is
           folded into AALo for the lower pass) -- confirm against the solve
           kernels before changing. */
        CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 (CUDA >= 9) needs an explicit analysis buffer */
        CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        CHKERRCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        CHKERRCUDA(WaitForCUDA());
        CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        CHKERRQ(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* the "lower" factor reuses the upper structure: it is solved as a
           TRANSPOSE operation on the upper-stored matrix (see solveOp below) */
        CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same sparsity pattern as the upper factor, different values (AALo) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        CHKERRCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        CHKERRCUDA(WaitForCUDA());
        CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        CHKERRQ(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        /* the structural arrays live on the device now; only AiUp/AjUp are
           freed here -- AAUp/AALo are shared with the refresh path and freed
           after the if/else below */
        CHKERRCUDA(cudaFreeHost(AiUp));
        CHKERRCUDA(cudaFreeHost(AjUp));
      } else {
        /* Refresh path: structure and analysis already exist on the device,
           recompute only the numerical values (same scaling as above) */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        /* NOTE(review): the message text looks copy-pasted -- these actually
           check the individual factor structs, not cusparseTriFactors */
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        CHKERRQ(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      CHKERRCUDA(cudaFreeHost(AAUp));
      CHKERRCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
959 
/* Ensure the device-side ICC triangular factors exist and are current, then
   cache the solve work vector, the factor nonzero count, and (for nontrivial
   orderings) the row/column permutation index arrays on the device.

   Collective on nothing; operates on the sequential factor matrix A. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  CHKERRQ(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both triangles of the symmetric factor are stored, so count the
     off-diagonal entries twice */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  CHKERRQ(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    CHKERRQ(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    CHKERRQ(ISGetIndices(iip,&irip));
    CHKERRQ(ISGetIndices(ip,&rip));
    /* Allocate the device index arrays only once (as the ILU analysis path
       does); without these guards every repeated numeric factorization leaked
       the previously allocated arrays. The contents are still refreshed on
       every call. */
    if (!cusparseTriFactors->rpermIndices) {
      cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) {
      cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    CHKERRQ(ISRestoreIndices(iip,&irip));
    CHKERRQ(ISDestroy(&iip));
    CHKERRQ(ISRestoreIndices(ip,&rip));
    CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
996 
/* Numeric Cholesky factorization for SEQAIJCUSPARSE: run the host (SeqAIJ)
   factorization, select the solve kernels according to whether the ordering is
   natural, and push the triangular factors to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata = (Mat_SeqAIJ*)B->data;
  IS             rowperm = bdata->row;
  PetscBool      natural;

  PetscFunctionBegin;
  /* factor on the host against an up-to-date host copy of A */
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  CHKERRQ(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: a natural (identity)
     ordering permits the permutation-free solve path */
  CHKERRQ(ISIdentity(rowperm,&natural));
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  CHKERRQ(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
1025 
1026 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1027 {
1028   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1029   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1030   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1031   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1032   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1033   cusparseIndexBase_t               indexBase;
1034   cusparseMatrixType_t              matrixType;
1035   cusparseFillMode_t                fillMode;
1036   cusparseDiagType_t                diagType;
1037 
1038   PetscFunctionBegin;
1039   /* allocate space for the transpose of the lower triangular factor */
1040   CHKERRQ(PetscNew(&loTriFactorT));
1041   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1042 
1043   /* set the matrix descriptors of the lower triangular factor */
1044   matrixType = cusparseGetMatType(loTriFactor->descr);
1045   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1046   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1047     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1048   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1049 
1050   /* Create the matrix description */
1051   CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1052   CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1053   CHKERRCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1054   CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1055   CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1056 
1057   /* set the operation */
1058   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1059 
1060   /* allocate GPU space for the CSC of the lower triangular factor*/
1061   loTriFactorT->csrMat = new CsrMatrix;
1062   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1063   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1064   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1065   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1066   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1067   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1068 
1069   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1070 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1071   CHKERRCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1072                                                loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1073                                                loTriFactor->csrMat->values->data().get(),
1074                                                loTriFactor->csrMat->row_offsets->data().get(),
1075                                                loTriFactor->csrMat->column_indices->data().get(),
1076                                                loTriFactorT->csrMat->values->data().get(),
1077                                                loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1078                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1079                                                CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1080   CHKERRCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
1081 #endif
1082 
1083   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1084   CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1085                                   loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1086                                   loTriFactor->csrMat->values->data().get(),
1087                                   loTriFactor->csrMat->row_offsets->data().get(),
1088                                   loTriFactor->csrMat->column_indices->data().get(),
1089                                   loTriFactorT->csrMat->values->data().get(),
1090                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1091                                   loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1092                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1093                                   CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
1094                                   #else
1095                                   loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1096                                   CUSPARSE_ACTION_NUMERIC, indexBase));
1097                                   #endif
1098   CHKERRCUDA(WaitForCUDA());
1099   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1100 
1101   /* Create the solve analysis information */
1102   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1103   CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
1104 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1105   CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1106                                          loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1107                                          loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1108                                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1109                                          &loTriFactorT->solveBufferSize));
1110   CHKERRCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
1111 #endif
1112 
1113   /* perform the solve analysis */
1114   CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1115                                    loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1116                                    loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1117                                    loTriFactorT->csrMat->column_indices->data().get(),
1118                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1119                                    loTriFactorT->solveInfo,
1120                                    loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1121                                    #else
1122                                    loTriFactorT->solveInfo));
1123                                    #endif
1124   CHKERRCUDA(WaitForCUDA());
1125   CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1126 
1127   /* assign the pointer */
1128   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1129 
1130   /*********************************************/
1131   /* Now the Transpose of the Upper Tri Factor */
1132   /*********************************************/
1133 
1134   /* allocate space for the transpose of the upper triangular factor */
1135   CHKERRQ(PetscNew(&upTriFactorT));
1136   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1137 
1138   /* set the matrix descriptors of the upper triangular factor */
1139   matrixType = cusparseGetMatType(upTriFactor->descr);
1140   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1141   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1142     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1143   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1144 
1145   /* Create the matrix description */
1146   CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1147   CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1148   CHKERRCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1149   CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1150   CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1151 
1152   /* set the operation */
1153   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1154 
1155   /* allocate GPU space for the CSC of the upper triangular factor*/
1156   upTriFactorT->csrMat = new CsrMatrix;
1157   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1158   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1159   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1160   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1161   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1162   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1163 
1164   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1165 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1166   CHKERRCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1167                                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1168                                                upTriFactor->csrMat->values->data().get(),
1169                                                upTriFactor->csrMat->row_offsets->data().get(),
1170                                                upTriFactor->csrMat->column_indices->data().get(),
1171                                                upTriFactorT->csrMat->values->data().get(),
1172                                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1173                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1174                                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1175   CHKERRCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
1176 #endif
1177 
1178   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1179   CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1180                                   upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1181                                   upTriFactor->csrMat->values->data().get(),
1182                                   upTriFactor->csrMat->row_offsets->data().get(),
1183                                   upTriFactor->csrMat->column_indices->data().get(),
1184                                   upTriFactorT->csrMat->values->data().get(),
1185                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1186                                   upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1187                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1188                                   CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
1189                                   #else
1190                                   upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1191                                  CUSPARSE_ACTION_NUMERIC, indexBase));
1192                                  #endif
1193 
1194   CHKERRCUDA(WaitForCUDA());
1195   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1196 
1197   /* Create the solve analysis information */
1198   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1199   CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
1200   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1201   CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1202                                          upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1203                                          upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1204                                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1205                                          &upTriFactorT->solveBufferSize));
1206   CHKERRCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
1207   #endif
1208 
1209   /* perform the solve analysis */
1210   /* christ, would it have killed you to put this stuff in a function????????? */
1211   CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1212                                    upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1213                                    upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1214                                    upTriFactorT->csrMat->column_indices->data().get(),
1215                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1216                                    upTriFactorT->solveInfo,
1217                                    upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1218                                    #else
1219                                    upTriFactorT->solveInfo));
1220                                    #endif
1221 
1222   CHKERRCUDA(WaitForCUDA());
1223   CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1224 
1225   /* assign the pointer */
1226   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1227   PetscFunctionReturn(0);
1228 }
1229 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its real
   part; used below with thrust::transform to turn the permuted csr2csc "values"
   (which hold original entry positions) into an integer index array. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1238 
/*
  Build (or refresh) an explicit on-device transpose of A, cached in
  cusparsestruct->matTranspose, so transpose multiplies can run as a plain SpMV
  on A^T.  For MAT_CUSPARSE_CSR the symbolic transpose is generated once with
  csr2csc and a permutation array (csr2csc_i) is cached so later value updates
  are a single permuted copy.  For ELL/HYB (CUDA < 11 only) the matrix is
  round-tripped HYB -> CSR -> CSC -> HYB.  No-op when A->transupdated is set.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is already current */
  CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  CHKERRQ(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR transposes cannot be updated in place; destroy and rebuild below */
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    CHKERRCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    CHKERRCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    CHKERRCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-side scalars used by the SpMV routines) */
    CHKERRCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    CHKERRCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate CSR storage for A^T: row/col dimensions swapped, same nnz */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* cache the uncompressed row offsets of A on the GPU; csr2csc reads them */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT keeps rows = A->rmap->n / cols = A->cmap->n (same as temp),
         and csr2hyb below is also called with (A->rmap->n, A->cmap->n); for a transpose
         of a rectangular matrix one would expect the dimensions swapped — confirm this
         path is only exercised for square matrices. */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build csr2csc_i once: run csr2csc with "values" 0,1,2,... so the output
         values are the source positions of each transposed entry, then convert
         them to integers.  Later updates become a simple permuted copy (below). */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      CHKERRCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      CHKERRCUDA(cudaFree(csr2cscBuffer));
     #endif
    }
    /* numeric update: scatter A's values into A^T's layout via the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1466 
1467 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  Solve A^T x = b on the GPU using transposed triangular factors (built lazily
  here via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve).  Since (LU)^T = U^T L^T,
  the upper factor is solved first, then the lower — the reverse of
  MatSolve_SeqAIJCUSPARSE — with the factorization's row permutation applied to
  b up front and the column permutation applied to the result at the end.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: x <- b[rperm] */
  /* NOTE(review): the end bound uses element iterator bGPU+n while the begin bound
     uses bGPU; thrust compares permutation iterators via their index iterators, so
     the copied range is governed by rpermIndices — confirm intended, cf. the
     matching copy in MatSolve_SeqAIJCUSPARSE which uses bGPU for both bounds. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1554 
/*
  Solve A^T x = b on the GPU for factors in natural ordering: identical to
  MatSolveTranspose_SeqAIJCUSPARSE (U^T first, then L^T) but with no row or
  column permutations, so b is fed directly to the first triangular solve and
  the second writes straight into x.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, solve U: tempGPU <- (U^T)^{-1} b */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: x <- (L^T)^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1623 
/*
  Solve A x = b on the GPU using the cached L and U triangular factors:
  apply the factorization's row permutation to b, solve L then U with
  cuSPARSE, then apply the column permutation to produce x.  Logs
  2*nnz - n flops for the two triangular solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: tempGPU <- b[rperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: x <- L^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: tempGPU <- U^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: x <- tempGPU[cperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1698 
/*
  Solve A x = b on the GPU for factors in natural ordering: same L-then-U
  sequence as MatSolve_SeqAIJCUSPARSE but with no row/column permutations,
  so b feeds the L solve directly and the U solve writes straight into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
  CHKERRQ(VecCUDAGetArrayRead(bb,&barray));

  CHKERRQ(PetscLogGpuTimeBegin());
  /* First, solve L: tempGPU <- L^{-1} b */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: x <- U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1759 
/* Downloads the CSR values (not the pattern) from the device when the GPU holds
   the only up-to-date copy, leaving the matrix in the PETSC_OFFLOAD_BOTH state. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* nothing to do unless the device copy is the only current one */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    /* only values move; row offsets/column indices are unchanged on both sides */
    CHKERRCUDA(cudaMemcpy(seq->a, csr->values->data().get(), seq->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    CHKERRCUDA(WaitForCUDA());
    CHKERRQ(PetscLogGpuToCpu(seq->nz*sizeof(PetscScalar)));
    CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1778 
/* Returns the host pointer to the CSR values, first syncing them from the GPU if needed */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* make sure the host copy is current before handing out the raw pointer */
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1786 
/* Ends read-write host access: the caller may have modified values, so the
   device copy is marked stale (PETSC_OFFLOAD_CPU) */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1794 
/* Returns a read-only host pointer to the CSR values, syncing from the GPU if needed.
   The offload mask is untouched since the caller promises not to modify the data. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1802 
/* Ends read-only host access; no offload-mask change is needed because the
   data was not modified (compare MatSeqAIJRestoreArray_SeqAIJCUSPARSE) */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
1809 
/* Returns the host values pointer for write-only access; no device-to-host copy
   is performed since the caller will overwrite the data anyway */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1816 
/* Ends write-only host access: the host copy is now authoritative, so the
   device copy is marked stale (PETSC_OFFLOAD_CPU) */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1824 
/* Uploads the host AIJ matrix to the GPU.  Two paths:
   - nonzero pattern unchanged and CSR format: copy values only;
   - otherwise: destroy and rebuild all cusparse structures (descriptor, CSR
     arrays, optional HYB/ELL conversion, compressed-row index map). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set PETSC_FALSE when no host values exist, so we do not claim OFFLOAD_BOTH */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      CHKERRCUDA(WaitForCUDA());
      CHKERRQ(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* values changed but the pattern did not: invalidate only the transpose values */
      CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* pattern (or format) changed: tear everything down and rebuild */
      CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed row storage only nonzero rows are represented */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* no host values: take nnz from the row offsets and remember the host copy is incomplete */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheckFalse(nnz && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        CHKERRCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        CHKERRCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, required because the handle uses CUSPARSE_POINTER_MODE_DEVICE below */
        CHKERRCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        CHKERRCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR matrix on the device, convert it to HYB, then discard the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once converted to HYB */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        CHKERRQ(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      CHKERRCUDA(WaitForCUDA());
      CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* if there were no host values (both == PETSC_FALSE) the host side is not a full copy */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1979 
/* Thrust functor over a zipped (src,dst) tuple: dst += src */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1989 
/* Thrust functor over a zipped (src,dst) tuple: dst = src */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1999 
/* Thrust functor over a zipped tuple, copying in the opposite direction: first = second */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
2009 
/* Per-product state attached to C->product->data for cusparse matrix-matrix products;
   freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;      /* caller's C was MATSEQDENSE (CPU): convert result back after the GPU computation */
  PetscScalar           *Bt;           /* device buffer for B^T (pre-CUDA-11 ABt/RARt path only, else NULL) */
  Mat                   X;             /* intermediate dense product for RARt/PtAP */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;         /* flop count logged during the numeric phase */
  CsrMatrix             *Bcsr;         /* alternative CSR view of B used when B is stored compressed; NULL otherwise */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* sparse descriptor matching Bcsr (used instead of Bmat->matDescr) */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor for C (or for X in RARt/PtAP) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;     /* SpGEMMreuse work buffer — presumably allocated in the symbolic phase (not shown here) */
  void                  *dBuffer5;     /* SpGEMMreuse work buffer — see dBuffer4 */
 #endif
  size_t                mmBufferSize;  /* current byte size of mmBuffer */
  void                  *mmBuffer;     /* SpMM/SpGEMM compute work buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;    /* SpGEMM descriptor reused across phases/calls */
#endif
};
2034 
/* Destructor for the MatMatCusparse product data: releases device buffers,
   cusparse descriptors, the intermediate dense matrix X, and the struct itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  CHKERRCUDA(cudaFree(mm->Bt)); /* cudaFree(NULL) is a no-op, so no guard needed */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors are created lazily, so each one must be guarded */
  if (mm->matSpBDescr) CHKERRCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr)   CHKERRCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr)   CHKERRCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc)  CHKERRCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)  CHKERRCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5)  CHKERRCUDA(cudaFree(mm->dBuffer5));
 #endif
  if (mm->mmBuffer)  CHKERRCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) CHKERRCUDA(cudaFree(mm->mmBuffer2));
 #endif
  CHKERRQ(MatDestroy(&mm->X));
  CHKERRQ(PetscFree(data));
  PetscFunctionReturn(0);
}
2058 
2059 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2060 
/* Numeric phase of sparse(A) x dense(B) products (AB, AtB, ABt, PtAP, RARt).
   Computes the sparse-times-dense part with cusparseSpMM (CUDA >= 11) or
   cusparse_csr_spmm (older CUDA); PtAP/RARt finish with a dense GEMM against X. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the sparse operand (A or its explicit transpose), its operation flag,
     and the dimensions m x n of the sparse-times-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP computes X = A*P here; Pt*X is done at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use an explicitly stored A^T so cusparse runs the faster non-transpose kernel */
      CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt computes X = A*R^t here; R*X is done at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) CHKERRQ(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  CHKERRQ(MatDenseCUDAGetArrayRead(B,&barray));

  CHKERRQ(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt write the intermediate product into mmdata->X instead of C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    CHKERRQ(MatDenseCUDAGetArrayWrite(C,&carray));
    CHKERRQ(MatDenseGetLDA(C,&clda));
  }

  CHKERRQ(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* NOTE(review): if only Clda changed, the retained matBDescr keeps the barray
       pointer recorded at creation — presumably barray is stable between calls for
       the same B; verify, since the pointer-refresh below only runs in the else branch */
    if (mmdata->initialized && mmdata->Blda != blda) {CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds the current one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      CHKERRCUDA(cudaFree(mmdata->mmBuffer));
      CHKERRCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    CHKERRCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated mmdata->Bt via cublas geam */
    CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  CHKERRQ(MatDenseCUDARestoreArrayRead(B,&barray));
  /* finish PtAP/RARt with a dense product against X; PETSC_TRUE requests the transposed form */
  if (product->type == MATPRODUCT_RARt) {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* undo the temporary conversions made for CPU inputs/outputs */
  if (mmdata->cisdense) {
    CHKERRQ(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    CHKERRQ(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2231 
/* Symbolic phase of sparse(A) x dense(B) products: sets the sizes/type of C,
   allocates the MatMatCusparse product data (plus Bt and X scratch where the
   product type needs them), and installs the numeric-phase callback */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  CHKERRQ(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  CHKERRQ(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  CHKERRQ(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    CHKERRCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    CHKERRQ(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2304 
2305 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2306 {
2307   Mat_Product                  *product = C->product;
2308   Mat                          A,B;
2309   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2310   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2311   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2312   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2313   PetscBool                    flg;
2314   cusparseStatus_t             stat;
2315   MatProductType               ptype;
2316   MatMatCusparse               *mmdata;
2317 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2318   cusparseSpMatDescr_t         BmatSpDescr;
2319 #endif
2320   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2321 
2322   PetscFunctionBegin;
2323   MatCheckProduct(C,1);
2324   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2325   CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
2326   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2327   mmdata = (MatMatCusparse*)C->product->data;
2328   A = product->A;
2329   B = product->B;
2330   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2331     mmdata->reusesym = PETSC_FALSE;
2332     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2333     PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2334     Cmat = Ccusp->mat;
2335     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2336     Ccsr = (CsrMatrix*)Cmat->mat;
2337     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2338     goto finalize;
2339   }
2340   if (!c->nz) goto finalize;
2341   CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2342   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2343   CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2344   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2345   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2346   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2347   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2348   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2349   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2350   PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2351   PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2352   PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2353   CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
2354   CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
2355 
2356   ptype = product->type;
2357   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2358     ptype = MATPRODUCT_AB;
2359     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2360   }
2361   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2362     ptype = MATPRODUCT_AB;
2363     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2364   }
2365   switch (ptype) {
2366   case MATPRODUCT_AB:
2367     Amat = Acusp->mat;
2368     Bmat = Bcusp->mat;
2369     break;
2370   case MATPRODUCT_AtB:
2371     Amat = Acusp->matTranspose;
2372     Bmat = Bcusp->mat;
2373     break;
2374   case MATPRODUCT_ABt:
2375     Amat = Acusp->mat;
2376     Bmat = Bcusp->matTranspose;
2377     break;
2378   default:
2379     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2380   }
2381   Cmat = Ccusp->mat;
2382   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2383   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2384   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2385   Acsr = (CsrMatrix*)Amat->mat;
2386   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2387   Ccsr = (CsrMatrix*)Cmat->mat;
2388   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2389   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2390   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2391   CHKERRQ(PetscLogGpuTimeBegin());
2392 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2393   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2394   CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2395   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2396     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2397                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2398                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2399                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2400   #else
2401     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2402                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2403                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2404                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2405     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2406                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2407                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2408   #endif
2409 #else
2410   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2411                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2412                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2413                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2414                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2415 #endif
2416   CHKERRQ(PetscLogGpuFlops(mmdata->flops));
2417   CHKERRCUDA(WaitForCUDA());
2418   CHKERRQ(PetscLogGpuTimeEnd());
2419   C->offloadmask = PETSC_OFFLOAD_GPU;
2420 finalize:
2421   /* shorter version of MatAssemblyEnd_SeqAIJ */
2422   CHKERRQ(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
2423   CHKERRQ(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
2424   CHKERRQ(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2425   c->reallocs         = 0;
2426   C->info.mallocs    += 0;
2427   C->info.nz_unneeded = 0;
2428   C->assembled = C->was_assembled = PETSC_TRUE;
2429   C->num_ass++;
2430   PetscFunctionReturn(0);
2431 }
2432 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse-sparse products
  C = A*B, C = A^T*B and C = A*B^T with A and B of type MATSEQAIJCUSPARSE, computed on the GPU
  with cuSPARSE SpGEMM (SpGEMMreuse from CUDA 11.4 on, the legacy csrgemm API before CUDA 11).

  It allocates the product data (MatMatCusparse) attached to C->product, builds the CSR structure
  of C on the device, computes the nonzero pattern of C, mirrors the pattern to the host arrays
  (c->i, c->j, c->ilen, c->imax), and installs MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE
  as the numeric phase.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* only the SeqAIJCUSPARSE/SeqAIJCUSPARSE combination is handled here */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: destroyed together with C->product */
  CHKERRQ(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* If A (resp. B) is symmetric, A^T*B (resp. A*B^T) reduces to the cheaper A*B; record that
     symmetry was exploited so that the numeric phase can verify it is still valid */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* Select operand mult structs and result sizes; since cuSPARSE SpGEMM only supports
     non-transposed operands (see opA/opB above), explicit transposes are formed when needed */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  CHKERRQ(MatSetSizes(C,m,n,m,n));
  CHKERRQ(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    /* C inherits the compressed row structure (nonzero rows) of A */
    c->compressedrow.nrows = a->compressedrow.nrows;
    CHKERRQ(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    CHKERRQ(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE in the numeric phase */
  CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  /* empty product: skip the SpGEMM entirely and produce an all-zero pattern */
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets; column indices
       and values are shared with B's own CSR, only the row_offsets array differs */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  /* NOTE(review): no count is implemented for MATPRODUCT_ABt (flops stays 0, see TODO below) */
  if (ptype == MATPRODUCT_AB) {
    /* 2*nnz(B(brow,:)) flops for each entry (i,brow) of A */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  CHKERRQ(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* create C's generic sparse descriptor with zero entries; row/column/value pointers are
     attached below with cusparseCsrSetPointers once the nnz of C is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  CHKERRCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* compute the nonzero pattern of C (each call follows the ask-size/allocate/run protocol) */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
  CHKERRCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
  CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaFree(dBuffer1));
  CHKERRCUDA(cudaFree(dBuffer2));

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy the pattern into C; dBuffer5 is retained because _compute reuses it */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaFree(dBuffer3));
  /* compute the numerical values once here, so a first numeric phase may be skipped (see reusesym) */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* pre-CUDA 11 path: legacy csrgemm API; the nnz of C is returned in host memory */
  CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  CHKERRQ(PetscLogGpuFlops(mmdata->flops));
  /* NOTE(review): the numeric phase calls WaitForCUDA() before PetscLogGpuTimeEnd() but this
     path does not — confirm whether the asymmetry is intentional (timing may under-report) */
  CHKERRQ(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the device CSR pattern of C to the host arrays expected by Mat_SeqAIJ */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  CHKERRQ(PetscMalloc1(m+1,&c->i));
  CHKERRQ(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    CHKERRCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    CHKERRCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  CHKERRQ(PetscMalloc1(m,&c->ilen));
  CHKERRQ(PetscMalloc1(m,&c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics, derived from the freshly copied row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  CHKERRQ(MatMarkDiagonal_SeqAIJ(C));
  CHKERRQ(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  CHKERRQ(PetscLayoutSetUp(C->rmap));
  CHKERRQ(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2829 
2830 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2831 
2832 /* handles sparse or dense B */
/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* classify the operands: dense B gets the AIJ*Dense kernels, CUSPARSE B/C the GPU SpGEMM */
  CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    CHKERRQ(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      CHKERRQ(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool  usecpu  = PETSC_FALSE;
    /* option name/title/man-page strings for the backend-selection query; NULL apiopt means
       no such option exists for this product type */
    const char *apiopt = NULL,*apititle = NULL,*prodtitle = NULL,*man = NULL;

    switch (product->type) {
    case MATPRODUCT_AB:
      apiopt = "-matmatmult_backend_cpu";          apititle = "MatMatMult";          prodtitle = "MatProduct_AB";   man = "MatMatMult";
      break;
    case MATPRODUCT_AtB:
      apiopt = "-mattransposematmult_backend_cpu"; apititle = "MatTransposeMatMult"; prodtitle = "MatProduct_AtB";  man = "MatTransposeMatMult";
      break;
    case MATPRODUCT_PtAP:
      apiopt = "-matptap_backend_cpu";             apititle = "MatPtAP";             prodtitle = "MatProduct_PtAP"; man = "MatPtAP";
      break;
    case MATPRODUCT_RARt:
      apiopt = "-matrart_backend_cpu";             apititle = "MatRARt";             prodtitle = "MatProduct_RARt"; man = "MatRARt";
      break;
    case MATPRODUCT_ABC:
      apiopt = "-matmatmatmult_backend_cpu";       apititle = "MatMatMatMult";       prodtitle = "MatProduct_ABC";  man = "MatMatMatMult";
      break;
    default:
      break;
    }
    if (apiopt) {
      if (product->api_user) { /* legacy API (MatMatMult etc.) uses the per-routine option name */
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,apititle,"Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool(apiopt,"Use CPU code",man,usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      } else {                 /* MatProduct API uses the generic algorithm-backend option */
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,prodtitle,"Mat");CHKERRQ(ierr);
        CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code",man,usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();CHKERRQ(ierr);
      }
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        CHKERRQ(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no native CUSPARSE triple products: decompose into pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    CHKERRQ(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
2954 
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy = A*xx: delegate to the shared kernel with no add-vector (NULL), no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2961 
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A*xx + yy: delegate to the shared kernel; no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2968 
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy = A^H*xx: delegate to the shared kernel with trans and herm both set */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2975 
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A^H*xx + yy: delegate to the shared kernel with trans and herm both set */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2982 
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy = A^T*xx: delegate to the shared kernel with trans set but not herm */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2989 
/* Device kernel: y[idx[i]] += x[i] for 0 <= i < n.
   Used by MatMultAddKernel_SeqAIJCUSPARSE() to scatter-add the compressed-row work
   vector back into the full result vector.
   A grid-stride loop with PetscInt indexing replaces the former flat `int` index:
   n is a PetscInt (possibly 64-bit), and the old `blockIdx.x*blockDim.x + threadIdx.x`
   is computed in 32-bit unsigned arithmetic, which can overflow for very large grids.
   The grid-stride form is also correct for any launch configuration.
   NOTE(review): idx[] entries are assumed to be valid indices into y (the caller passes
   cprowIndices, which index the full-length output vector) — no bounds check on them here. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2995 
/* z = op(A) x + y: the common kernel behind all MatMult/MatMultAdd variants of this type.
   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = identity (plain A).
   yy may be NULL (no add); yy == zz is allowed (in-place add). Handles both plain CSR
   storage and "compressed row" storage (zero rows dropped), which requires staging through
   cusparsestruct->workVector plus a scatter/gather against matstruct->cprowIndices. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  /* herm without trans would mean "conjugate only", which this kernel does not support */
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) {
    /* Empty matrix: op(A) x == 0, so the result is y (or zero when there is no y) */
    if (!yy) CHKERRQ(VecSet_SeqCUDA(zz,0));
    else CHKERRQ(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Transposed product: either let cuSPARSE apply op on the fly, or use a cached explicit transpose */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    CHKERRQ(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) CHKERRQ(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else CHKERRQ(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    CHKERRQ(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* For op=^T/^H the roles of the dimensions swap relative to the branch above */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* The cuSpMV[opA] cache below is indexed by the numeric value of the cusparse enum */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        CHKERRCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        CHKERRCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      CHKERRCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      /* Pre-CUDA-11 legacy csrmv path */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      CHKERRCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        CHKERRCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    CHKERRQ(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          CHKERRQ(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        CHKERRQ(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        CHKERRQ(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        CHKERRQ(PetscLogGpuTimeEnd());
      }
    } else {
      /* Transposed product already wrote the full-length result; just add yy when it is distinct */
      if (yy && yy != zz) {
        CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    CHKERRQ(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) CHKERRQ(VecCUDARestoreArray(zz,&zarray));
    else CHKERRQ(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* Flop count: 2 flops per stored nonzero; without the add, one store per nonzero row is saved */
  if (yy) {
    CHKERRQ(PetscLogGpuFlops(2.0*a->nz));
  } else {
    CHKERRQ(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3184 
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* zz = A^T*xx + yy: delegate to the shared kernel with trans set but not herm */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3191 
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state_before    = A->nonzerostate;

  PetscFunctionBegin;
  /* Finish the CPU-side assembly first; it may change A->nonzerostate */
  CHKERRQ(MatAssemblyEnd_SeqAIJ(A,mode));
  if (cusparsestruct->deviceMat && state_before != A->nonzerostate) {
    /* The sparsity pattern changed, so the raw device mirror is stale: release it */
    CHKERRQ(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    CHKERRCUDA(cudaFree(cusparsestruct->deviceMat));
    cusparsestruct->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3207 
3208 /* --------------------------------------------------------------------------------*/
3209 /*@
3210    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3212    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3213    assembly performance the user should preallocate the matrix storage by setting
3214    the parameter nz (or the array nnz).  By setting these parameters accurately,
3215    performance during matrix assembly can be increased by more than a factor of 50.
3216 
3217    Collective
3218 
3219    Input Parameters:
3220 +  comm - MPI communicator, set to PETSC_COMM_SELF
3221 .  m - number of rows
3222 .  n - number of columns
3223 .  nz - number of nonzeros per row (same for all rows)
3224 -  nnz - array containing the number of nonzeros in the various rows
3225          (possibly different for each row) or NULL
3226 
3227    Output Parameter:
3228 .  A - the matrix
3229 
3230    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3232    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3233 
3234    Notes:
3235    If nnz is given then nz is ignored
3236 
3237    The AIJ format (also called the Yale sparse matrix format or
3238    compressed row storage), is fully compatible with standard Fortran 77
3239    storage.  That is, the stored row and column indices can begin at
3240    either one (as in Fortran) or zero.  See the users' manual for details.
3241 
3242    Specify the preallocated storage with either nz or nnz (not both).
3243    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3244    allocation.  For large problems you MUST preallocate memory or you
3245    will get TERRIBLE performance, see the users' manual chapter on matrices.
3246 
3247    By default, this format uses inodes (identical nodes) when possible, to
3248    improve numerical efficiency of matrix-vector products and solves. We
3249    search for consecutive rows with the same nonzero structure, thereby
3250    reusing matrix information to achieve increased efficiency.
3251 
3252    Level: intermediate
3253 
3254 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3255 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create an m-by-n sequential matrix, switch it to the CUSPARSE type, then preallocate
     with either the uniform row count nz or the per-row counts nnz[] */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3265 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* Names of every method this type composed on the object; all are reset to NULL below */
  const char *const composed[] = {"MatSeqAIJCopySubArray_C","MatCUSPARSESetFormat_C","MatCUSPARSESetUseCPUSolve_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C","MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                  "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C","MatFactorGetSolverType_C",
                                  "MatSetPreallocationCOO_C","MatSetValuesCOO_C","MatConvert_seqaijcusparse_hypre_C"};
  size_t            i;

  PetscFunctionBegin;
  /* Release GPU-side storage: triangular-factor data for factored matrices, plain data otherwise */
  if (A->factortype != MAT_FACTOR_NONE) {
    CHKERRQ(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  } else {
    CHKERRQ(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  }
  for (i=0; i<sizeof(composed)/sizeof(composed[0]); i++) {
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,composed[i],NULL));
  }
  /* Fall through to the base AIJ destructor for the CPU-side data */
  CHKERRQ(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3287 
3288 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3289 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Duplicate as a plain SeqAIJ matrix, then convert the copy in place to the CUSPARSE type */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3297 
/* Y = a*X + Y on the GPU. Dispatches on the relationship between the two sparsity
   patterns: identical pattern -> single cuBLAS axpy on the value arrays;
   X's pattern a subset of Y's -> cuSPARSE csrgeam writing into Y's pattern;
   otherwise -> fall back to the CPU SeqAIJ implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* Differing axpy implementations means one of the operands is bound to CPU: defer to the CPU path */
  if (X->ops->axpy != Y->ops->axpy) {
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(Y));
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: equal nonzero counts and (device-side) equal
     row offsets and column indices imply the patterns really are the same */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* coefficient of Y in a*X + b*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* a and b live on the host here, so switch the cusparse pointer mode (restored below) */
    CHKERRCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace buffer for csrgeam */
    CHKERRCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    CHKERRCUDA(cudaMalloc(&buffer,bufferSize));
    CHKERRQ(PetscLogGpuTimeBegin());
    /* Output aliases Y's arrays: the result is written in place into Y's pattern */
    CHKERRCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    CHKERRQ(PetscLogGpuFlops(x->nz + y->nz));
    CHKERRQ(PetscLogGpuTimeEnd());
    CHKERRCUDA(cudaFree(buffer));
#else
    CHKERRQ(PetscLogGpuTimeBegin());
    CHKERRCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    CHKERRQ(PetscLogGpuFlops(x->nz + y->nz));
    CHKERRQ(PetscLogGpuTimeEnd());
#endif
    CHKERRCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* Identical patterns: the value arrays line up, so a dense axpy on them suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle));
    CHKERRQ(PetscBLASIntCast(x->nz,&bnz));
    CHKERRQ(PetscLogGpuTimeBegin());
    CHKERRCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    CHKERRQ(PetscLogGpuFlops(2.0*bnz));
    CHKERRQ(PetscLogGpuTimeEnd());
    CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* Different patterns: CPU fallback, after invalidating the cached transpose */
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
3391 
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc_one = 1, blas_nz = 1;

  PetscFunctionBegin;
  /* Y = a*Y: scale the nz stored values in place on the GPU with one cuBLAS scal call */
  CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&vals));
  CHKERRQ(PetscCUBLASGetHandle(&handle));
  CHKERRQ(PetscBLASIntCast(aij->nz,&blas_nz));
  CHKERRQ(PetscLogGpuTimeBegin());
  CHKERRCUBLAS(cublasXscal(handle,blas_nz,&a,vals,inc_one));
  CHKERRQ(PetscLogGpuFlops(blas_nz));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&vals));
  CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3411 
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij           = (Mat_SeqAIJ*)A->data;
  PetscBool  zeroed_on_gpu  = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    /* Zero the device-side CSR values, and those of the cached explicit transpose, if present */
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        zeroed_on_gpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* Always zero the host copy as well */
  CHKERRQ(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
  /* If both copies were zeroed they are in sync; otherwise only the CPU copy is valid */
  A->offloadmask = zeroed_on_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3440 
/* Switch the matrix between CPU (flg=PETSC_TRUE) and GPU execution by swapping the
   function-table entries and composed methods between the SeqAIJ and SeqAIJCUSPARSE
   implementations. For factored matrices only the flag is recorded. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* Factored matrices keep their solve path; just record the binding */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* Binding to CPU: make sure the host copy is current before routing ops to SeqAIJ */
    CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* Clear the whole inner SeqAIJ ops table (array accessors etc.) set by the GPU branch */
    CHKERRQ(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    /* Binding to GPU: install the CUSPARSE implementations and array accessors */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to CPU and present */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3503 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE. Depending on reuse this duplicates A
   (MAT_INITIAL_MATRIX), copies into an existing *newmat (MAT_REUSE_MATRIX), or
   converts A in place (MAT_INPLACE_MATRIX, where *newmat is A itself). Allocates the
   GPU-side spptr data, installs the CUSPARSE ops tables, and composes type methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    CHKERRQ(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    CHKERRQ(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* Vectors created for this matrix default to CUDA vectors */
  CHKERRQ(PetscFree(B->defaultvectype));
  CHKERRQ(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* Regular matrix: set up the cusparse handle, storage format, and algorithm choices */
      Mat_SeqAIJCUSPARSE *spptr;
      CHKERRQ(PetscNew(&spptr));
      CHKERRCUSPARSE(cusparseCreate(&spptr->handle));
      CHKERRCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* Factored matrix: only the triangular-factor container and handle are needed */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      CHKERRQ(PetscNew(&spptr));
      CHKERRCUSPARSE(cusparseCreate(&spptr->handle));
      CHKERRCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* Start bound to the GPU: this installs the CUSPARSE ops and composed methods */
  CHKERRQ(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  CHKERRQ(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3563 
/* MatCreate method for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix first, then
   convert it in place, which installs the GPU-specific data and operations. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  CHKERRQ(MatCreate_SeqAIJ(B));
  CHKERRQ(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
3571 
3572 /*MC
3573    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3574 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3577    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3578 
3579    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3584 
3585   Level: beginner
3586 
3587 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3588 M*/
3589 
3590 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3591 
/* Registers the CUSPARSE solver packages with PETSc's solver-type registry so that
   MatGetFactor() can locate them for the supported factorization kinds. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  /* the banded LU solver is registered for plain MATSEQAIJ input */
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  /* LU/Cholesky/ILU/ICC all share the same factory for MATSEQAIJCUSPARSE */
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
3603 
/* Releases all COO-assembly state held in the matrix's CUSPARSE context: the basic-path
   permutation arrays (cooPerm/cooPerm_a) and the extended-path device maps (jmap_d/perm_d).
   Safe to call multiple times, and a no-op when the context has not been created yet. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    CHKERRCUDA(cudaFree(cusp->jmap_d));
    CHKERRCUDA(cudaFree(cusp->perm_d));
    /* Null the freed device pointers: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d
       conditionally on their being non-NULL, so leaving stale values here would cause a
       double cudaFree() when the matrix is later destroyed */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3621 
/* Frees a Mat_SeqAIJCUSPARSE context and everything it owns: the device multiply structs
   for the matrix and its transpose, thrust work arrays, COO assembly scratch, the extended
   COO device maps, and finally the cuSPARSE handle and the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) CHKERRCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    /* extended-COO device maps are only allocated on that code path, hence the guards */
    if ((*cusparsestruct)->jmap_d) CHKERRCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) CHKERRCUDA(cudaFree((*cusparsestruct)->perm_d));
    CHKERRQ(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
3640 
/* Frees a CsrMatrix: releases its three device arrays (values, column indices, row
   offsets), deletes the struct itself, and nulls the caller's pointer.  NULL is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3653 
/* Frees one triangular-factor struct: its cuSPARSE matrix descriptor and analysis info,
   the device CSR copy of the factor, and the solve / host / csr2csc scratch buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) CHKERRCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) CHKERRCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
    CHKERRQ(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   CHKERRCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   CHKERRCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) CHKERRCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    CHKERRQ(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
3670 
/* Frees a Mat_SeqAIJCUSPARSEMultStruct: the device matrix data (CSR, or HYB on pre-11
   CUDA), its cuSPARSE descriptors, the compressed-row index array, the device-side scalar
   constants, and (CUDA >= 11) any initialized cached SpMV buffers and vector descriptors. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        CHKERRCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        CHKERRQ(CsrMatrix_Destroy(&mat)); /* check the error code, consistent with every other call site of this helper */
      }
    }
    if ((*matstruct)->descr) CHKERRCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) CHKERRCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) CHKERRCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  CHKERRCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) CHKERRCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* release each initialized cached SpMV slot: its workspace and dense-vector descriptors */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        CHKERRCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3712 
/* Releases everything owned by a TriFactors context except the cuSPARSE handle, so the
   context can be reused for a new factorization.  Safe to call more than once. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* Null the band buffers after freeing them: Reset() may run again before Destroy(),
       and the stale pointers would otherwise be passed to cudaFree() a second time */
    if ((*trifactors)->a_band_d) { CHKERRCUDA(cudaFree((*trifactors)->a_band_d)); (*trifactors)->a_band_d = NULL; }
    if ((*trifactors)->i_band_d) { CHKERRCUDA(cudaFree((*trifactors)->i_band_d)); (*trifactors)->i_band_d = NULL; }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3733 
/* Destroys a TriFactors context entirely: resets all factor data, destroys the
   cuSPARSE handle, and frees the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    /* assignment split out of the condition: same behavior, no -Wparentheses hazard */
    handle = (*trifactors)->handle;
    if (handle) CHKERRCUSPARSE(cusparseDestroy(handle));
    CHKERRQ(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3748 
/* Lexicographic less-than on (i,j) index pairs: orders COO entries by row, then column. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 < r2) || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
3759 
/* Equality on (i,j) index pairs: true when both row and column indices match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3769 
/* Binary op for adjacent_difference: 1 when the two values differ, 0 when equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3778 
/* Logical-or of two flags, returned as a 0/1 PetscInt. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3787 
3788 #include <thrust/iterator/discard_iterator.h>
3789 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Inserts/adds COO values into a matrix preallocated with
   MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   v[]   - values in the caller's original COO ordering (host or device memory; host
           input is staged to the device first), or NULL
   imode - INSERT_VALUES overwrites the CSR values, ADD_VALUES accumulates into them

   cusp->cooPerm maps sorted CSR positions back to positions in v[]; cusp->cooPerm_a,
   when present, groups repeated (i,j) entries so their contributions are summed. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was made: just run a (no-op) assembly and return */
    CHKERRQ(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    CHKERRQ(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values with INSERT_VALUES zeroes the matrix; with ADD_VALUES it is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage host values to a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  CHKERRQ(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  CHKERRQ(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device only */
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  CHKERRQ(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  CHKERRQ(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  CHKERRQ(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3870 
/* Marks the cached device transpose of A as out of date.  With destroy == PETSC_TRUE the
   transpose multiply struct and the csr2csc index map are freed outright; otherwise they
   are kept for reuse and only flagged stale via A->transupdated. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0); /* nothing cached yet */
  if (destroy) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3886 
3887 #include <thrust/binary_search.h>
3888 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Preallocates the matrix from n COO index pairs (coo_i[], coo_j[]), all assumed
   non-negative.  On the device it sorts the pairs by (row,col), deduplicates them, and
   records in cusp->cooPerm the permutation from CSR order back to the caller's COO
   order (plus cusp->cooPerm_a when duplicates exist) for later MatSetValuesCOO calls.
   The resulting CSR pattern is mirrored into the host Mat_SeqAIJ arrays and a zeroed
   device copy is created so the matvec structures exist immediately. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  CHKERRQ(PetscLayoutSetUp(A->rmap));
  CHKERRQ(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard any previous permutation arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    CHKERRQ(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    CHKERRQ(PetscLogGpuTimeEnd());

    /* replace the host AIJ arrays with ones matching the deduplicated COO pattern */
    CHKERRQ(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    CHKERRQ(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    CHKERRCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    CHKERRQ(PetscMalloc1(a->nz,&a->a));
    CHKERRQ(PetscMalloc1(a->nz,&a->j));
    CHKERRCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) CHKERRQ(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) CHKERRQ(PetscMalloc1(A->rmap->n,&a->imax));
    /* per-row lengths and the maximum row length, plus the count of nonempty rows */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    CHKERRQ(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    CHKERRQ(MatMarkDiagonal_SeqAIJ(A));
  } else {
    CHKERRQ(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  CHKERRQ(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  CHKERRQ(PetscArrayzero(a->a,a->nz));
  CHKERRQ(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
4000 
/* COO preallocation dispatcher.  Uses the fast 'Basic' device path when the index arrays
   live on the device or contain no negative indices; otherwise falls back to the host
   SeqAIJ COO machinery and uploads its jmap/perm arrays to the device for the extended
   path used by MatSetValuesCOO_SeqAIJCUSPARSE(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* discard any COO state from a previous preallocation */
  CHKERRQ(MatResetPreallocationCOO_SeqAIJ(mat));
  CHKERRQ(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    CHKERRQ(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* only host arrays can be scanned for negative (i.e., ignored) indices */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    CHKERRQ(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* mirror the host-side COO maps (jmap: per-nonzero ranges; perm: COO permutation) on the device */
    CHKERRCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    CHKERRCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4036 
/* Device kernel: gather/accumulate COO values into the CSR value array a[].
   kv[]   - input COO values (device memory), in the user's original ordering
   nnz    - number of CSR nonzeros
   jmap[] - jmap[i]..jmap[i+1] delimits, within perm[], the COO entries mapping to CSR slot i
   perm[] - indices into kv[] for the (sorted) COO entries
   imode  - INSERT_VALUES overwrites a[i]; otherwise the sum is accumulated into a[i]
   Uses a grid-stride loop over a 1-D launch, so any grid size covers all nnz entries. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  const PetscCount stride = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;
    /* sum every repeated COO contribution that lands in CSR slot i */
    for (PetscCount k = jmap[i]; k < jmap[i+1]; k++) sum += kv[perm[k]];
    if (imode == INSERT_VALUES) a[i]  = sum;
    else                        a[i] += sum;
  }
}
4047 
/* Sets/adds COO values on the device.  On the extended path the MatAddCOOValues kernel
   gathers contributions via the jmap/perm device maps; otherwise the thrust-based
   'Basic' routine is used.  Host-resident v[] is staged through a temporary device buffer. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    CHKERRQ(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      CHKERRCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* write access suffices for INSERT (all values overwritten); ADD needs current values */
    if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else CHKERRQ(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      CHKERRCUDA(cudaPeekAtLastError()); /* catch kernel-launch errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else CHKERRQ(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) CHKERRCUDA(cudaFree((void*)v1)); /* release the staging buffer */
  } else {
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}
4082 
4083 /*@C
4084     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4085 
4086    Not collective
4087 
4088     Input Parameters:
4089 +   A - the matrix
4090 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4091 
4092     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4095 
4096     Level: developer
4097 
4098     Notes:
4099       When compressed is true, the CSR structure does not contain empty rows
4100 
4101 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4102 @*/
/* Returns device pointers to the CSR row offsets (i) and column indices (j) of A.
   When compressed is PETSC_FALSE and the matrix uses compressed-row storage, a full
   (A->rmap->n + 1)-entry row-offset array is lazily built and cached on the device.
   Either output pointer may be NULL if that array is not wanted. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* was "!i || !j": that made the per-pointer guards below unreachable and silently
     ignored requests for just one of the two arrays */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4130 
4131 /*@C
4132     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4133 
4134    Not collective
4135 
4136     Input Parameters:
4137 +   A - the matrix
4138 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4139 
4140     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4143 
4144     Level: developer
4145 
4146 .seealso: MatSeqAIJCUSPARSEGetIJ()
4147 @*/
/* Companion to MatSeqAIJCUSPARSEGetIJ(): invalidates the user's borrowed pointers.
   No device state needs updating since the access was read-only. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4157 
4158 /*@C
4159    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4160 
4161    Not Collective
4162 
4163    Input Parameter:
4164 .   A - a MATSEQAIJCUSPARSE matrix
4165 
4166    Output Parameter:
4167 .   a - pointer to the device data
4168 
4169    Level: developer
4170 
4171    Notes: may trigger host-device copies if up-to-date matrix data is on host
4172 
4173 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4174 @*/
/* Hands out a read-only device pointer to the CSR value array of A, copying host data
   to the device first when the host copy is the freshest.  Read-only access leaves the
   offload mask and the cached transpose untouched. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparse = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusparse->format == MAT_CUSPARSE_ELL || cusparse->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before exposing it */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparse->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparse->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4192 
4193 /*@C
4194    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4195 
4196    Not Collective
4197 
4198    Input Parameter:
4199 .   A - a MATSEQAIJCUSPARSE matrix
4200 
4201    Output Parameter:
4202 .   a - pointer to the device data
4203 
4204    Level: developer
4205 
4206 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4207 @*/
/* Companion to MatSeqAIJCUSPARSEGetArrayRead(): invalidates the borrowed pointer.
   No state or object-state updates are needed because the access was read-only. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}
4217 
4218 /*@C
4219    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4220 
4221    Not Collective
4222 
4223    Input Parameter:
4224 .   A - a MATSEQAIJCUSPARSE matrix
4225 
4226    Output Parameter:
4227 .   a - pointer to the device data
4228 
4229    Level: developer
4230 
4231    Notes: may trigger host-device copies if up-to-date matrix data is on host
4232 
4233 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4234 @*/
/* Hands out a read-write device pointer to the CSR value array of A.  The device copy is
   refreshed from the host first if needed; since the caller may modify the values, the
   GPU copy becomes authoritative and the cached transpose values are invalidated. */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparse = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusparse->format == MAT_CUSPARSE_ELL || cusparse->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparse->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparse->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is now the valid one */
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been changed through the read-write pointer: drop cached diagonal info */
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
  /* bump the object state so dependent objects notice the modification */
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* prevent accidental use of the pointer after restore */
  PetscFunctionReturn(0);
}
4280 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* a raw values array is only available with the CSR storage format */
  PetscCheckFalse(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is issued */
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the caller will overwrite the device values: flag GPU data as valid and
     invalidate any cached explicit transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4316 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written on the device: drop cached diagonal info */
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
  /* bump the object state so dependent objects notice the modification */
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* prevent accidental use of the pointer after restore */
  PetscFunctionReturn(0);
}
4343 
/* Strict weak ordering on (row, col, value, perm) tuples: lexicographic by (row, col).
   Used to merge the COO representations of two matrices row by row. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>();
    const int r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;        /* different rows: order by row index */
    return t1.get<1>() < t2.get<1>();    /* same row: order by column index */
  }
};
4354 
/* Unary functor adding a fixed offset to an integer; used to shift column indices
   (e.g. by A->cmap->n when appending the columns of B after those of A). */
struct Shift
{
  int _shift; /* offset applied to every input value */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) { return _shift + c; }
};
4366 
/* Merges two SeqAIJCUSPARSE matrices A, B by concatenating their columns, i.e. C = [A, B]
   (the [A';B']' operation in MATLAB notation). A and B must have the same number of rows.
   With MAT_REUSE_MATRIX only the numerical values of C are updated, reusing the symbolic
   information (including the cooPerm scatter permutation) computed at the first
   MAT_INITIAL_MATRIX call. MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* symbolic + numeric phase: create C and build its CSR structure on the device */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    CHKERRQ(MatCreate(PETSC_COMM_SELF,C));
    CHKERRQ(MatSetSizes(*C,m,n,m,n));
    CHKERRQ(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      CHKERRQ(PetscLogGpuTimeBegin());
      /* expand both CSR row offsets into COO row indices so the entries of A and B
         can be merged row by row with a single thrust::merge */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift of B's column indices applied above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged positions into cooPerm: first Annz entries map A's values into C,
         the remaining Bnnz entries map B's values (wPerm flags A-origin entries with 1) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      CHKERRQ(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C' is the vertical concatenation [A'; B']: B''s row offsets are appended shifted by nnz(A) */
        CHKERRQ(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        CHKERRQ(PetscLogGpuTimeEnd());

        CHKERRCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        CHKERRCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        CHKERRCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        CHKERRCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        CHKERRCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        CHKERRCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* replicate the CSR structure on the host, as expected of a MATSEQAIJ */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    CHKERRQ(PetscMalloc1(m+1,&c->i));
    CHKERRQ(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      CHKERRCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      CHKERRCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    CHKERRQ(PetscMalloc1(m,&c->ilen));
    CHKERRQ(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    CHKERRQ(MatMarkDiagonal_SeqAIJ(*C));
    CHKERRQ(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    CHKERRQ(PetscLayoutSetUp((*C)->rmap));
    CHKERRQ(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* numeric-only phase: reuse the symbolic structure and permutation of C */
    PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
      CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      CHKERRQ(PetscLogGpuTimeBegin());
      /* scatter the values of A then B into C through the cooPerm permutation
         (first Annz entries are A's destinations, the rest are B's) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      CHKERRQ(PetscLogGpuTimeEnd());
    }
  }
  CHKERRQ(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4668 
/* Copies n entries of the device CSR value array of A into v.
   If idx is provided, performs the gather v[k] = a_values[idx[k]]; otherwise the first n
   values are copied verbatim. v may point to either host or device memory; the location
   is auto-detected and the transfer direction chosen accordingly. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the index set, then gather on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* stage the gathered values in a device buffer; copied back to the host below */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      CHKERRCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    CHKERRCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the transfer above went device->host; it was previously
     mis-logged as PetscLogCpuToGpu */
  if (!dmem) CHKERRQ(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4706