xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ffa8c5705e8ab2cf85ee1d14dbe507a6e2eb5283)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Name tables in the layout expected by PetscOptionsEnum(): the value strings in
   0-based enum-value order, then the enum type name, the option-string prefix, and
   a terminating 0 sentinel */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  /* cusparseCsr2CscAlg_t has no value 0, so a placeholder "INVALID" entry keeps the
     table 0-based for PetscOptionsEnum() */
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
58 
/* Symbolic factorization entry points (wrap the CPU SeqAIJ symbolic phases) and the
   numeric phases that additionally prepare GPU triangular factors */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves (the _NaturalOrdering variants are used when the row and column
   permutations are both identities) and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destruction helpers for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* COO assembly and sub-array extraction support */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/* Stores the given CUDA stream in the matrix's GPU data structure and makes the
   cuSPARSE handle issue its work on that stream. Errors if A has no GPU data (spptr). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  PetscCallCUSPARSE(cusparseSetStream(cusp->handle,cusp->stream));
  PetscFunctionReturn(0);
}
105 
/* Replaces the cuSPARSE handle stored on the matrix with the given one (destroying
   any previously owned handle first) and switches the handle to device pointer mode.
   Errors if A has no GPU data (spptr). */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    cusp->handle = handle;
  }
  /* scalars (alpha/beta) are expected to live in device memory */
  PetscCallCUSPARSE(cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  PetscFunctionReturn(0);
}
121 
/* Drops (without destroying) the cuSPARSE handle stored on the matrix; the caller
   retains ownership of the handle. Silently a no-op when A is not of type
   MATSEQAIJCUSPARSE or has no GPU data. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse));
  if (!isseqaijcusparse || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE ("cusparse")
   as the solver package that produced the factorization */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/* MatGetFactor() implementation for MATSOLVERCUSPARSE: creates the (empty) factor
   matrix B, installs the symbolic factorization routines -- the GPU variants unless
   A is bound to the CPU -- and records the preferred ordering for each supported
   factor type (ND for full LU/Cholesky, natural for the incomplete variants). */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt  n   = A->rmap->n;
  PetscBool cpu = A->boundtocpu;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* propagate CPU binding to the factor before wiring in the symbolic routines */
  if (cpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    (*B)->ops->ilufactorsymbolic = cpu ? MatILUFactorSymbolic_SeqAIJ : MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = cpu ? MatLUFactorSymbolic_SeqAIJ  : MatLUFactorSymbolic_SeqAIJCUSPARSE;
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = cpu ? MatICCFactorSymbolic_SeqAIJ      : MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = cpu ? MatCholeskyFactorSymbolic_SeqAIJ : MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
194 
/* Type-specific implementation of MatCUSPARSESetFormat(): records the requested
   storage format. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL have the same effect here
   since a sequential matrix keeps a single format. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both set the single stored format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
212 
213 /*@
214    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
215    operation. Only the MatMult operation can use different GPU storage formats
216    for MPIAIJCUSPARSE matrices.
217    Not Collective
218 
219    Input Parameters:
220 +  A - Matrix of type SEQAIJCUSPARSE
221 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
222 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
223 
224    Output Parameter:
225 
226    Level: intermediate
227 
228 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
229 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if the type composed one;
     silently a no-op for matrix types without MatCUSPARSESetFormat_C */
  PetscCall(PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)));
  PetscFunctionReturn(0);
}
237 
/* Type-specific implementation of MatCUSPARSESetUseCPUSolve(): records whether
   MatSolve() should run on the CPU instead of the GPU */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
246 
247 /*@
248    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
249 
250    Input Parameters:
251 +  A - Matrix of type SEQAIJCUSPARSE
252 -  use_cpu - set flag for using the built-in CPU MatSolve
253 
254    Output Parameter:
255 
   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or on the GPU (GPU is the default).
260 
261    Level: intermediate
262 
263 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
264 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if the type composed one;
     silently a no-op for matrix types without MatCUSPARSESetUseCPUSolve_C */
  PetscCall(PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)));
  PetscFunctionReturn(0);
}
272 
/* MatSetOption() for SeqAIJCUSPARSE: handles MAT_FORM_EXPLICIT_TRANSPOSE locally
   and forwards every other option to the CPU SeqAIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
288 
289 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
290 
/* Numeric LU factorization: runs the CPU SeqAIJ numeric phase, then (unless the
   user requested CPU solves) installs the GPU triangular-solve routines -- the
   NaturalOrdering variants when both the row and column permutations are
   identities -- and copies the triangular factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ*)B->data;
  IS                 isrow  = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusp  = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!cusp->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no MatMatSolve support in either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
327 
/* Processes the -mat_cusparse_* command-line options (storage format, CPU solve,
   and -- on CUDA >= 11 -- the SpMV/SpMM/csr2csc algorithm choices). Options are
   only consulted for unfactored matrices. The PetscCheckFalse() guards verify that
   the cuSPARSE enum values still match the positions PETSc assumes in the
   MatCUSPARSE*Algorithms[] tables, since PetscOptionsEnum() assigns values by
   position. Uses PetscCall() directly, consistent with the rest of this file,
   instead of the legacy "ierr = ...;PetscCall(ierr)" pattern. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCall(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options"));
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscCall(PetscOptionsTail());
  PetscFunctionReturn(0);
}
368 
/* Symbolic ILU: discards any stale GPU triangular factors, runs the CPU symbolic
   phase, and installs the CUSPARSE numeric factorization routine */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
379 
/* Symbolic LU: discards any stale GPU triangular factors, runs the CPU symbolic
   phase, and installs the CUSPARSE numeric factorization routine */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
390 
/* Symbolic ICC: discards any stale GPU triangular factors, runs the CPU symbolic
   phase, and installs the CUSPARSE numeric Cholesky routine */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
401 
/* Symbolic Cholesky: discards any stale GPU triangular factors, runs the CPU
   symbolic phase, and installs the CUSPARSE numeric Cholesky routine */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
412 
/* Builds (or refreshes) the unit-diagonal lower triangular factor L on the GPU from
   the CPU-resident factored SeqAIJ data of A. On first call it assembles a CSR copy
   of L (explicit 1.0 diagonal entries included) in pinned host memory, uploads it
   into a new CsrMatrix, and runs the cuSPARSE triangular-solve analysis; on later
   calls (structure unchanged) it only re-uploads the numerical values.
   NOTE(review): assumes the factored SeqAIJ layout in which a->i/a->j/a->a hold the
   strictly lower triangular part row by row (so row 0 of L is just its unit
   diagonal, and nzLower = n + ai[n] - ai[1]) -- confirm against MatLUFactorNumeric_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host memory so the thrust assign() uploads can run efficiently */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is only the unit diagonal entry;
           each later row i is its strictly-lower entries followed by a 1.0 at column i */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* upload row pointers, column indices, and values to the GPU */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* CUDA >= 9 csrsv2 API requires a caller-provided work buffer for the analysis/solve */
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values array (AA_h) for fast value-only updates; the index
           arrays are no longer needed on the host */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      /* thrust/cusparse wrappers may throw; convert to a PETSc error */
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
556 
557 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
558 {
559   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
560   PetscInt                          n = A->rmap->n;
561   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
562   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
563   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
564   const MatScalar                   *aa = a->a,*v;
565   PetscInt                          *AiUp, *AjUp;
566   PetscInt                          i,nz, nzUpper, offset;
567 
568   PetscFunctionBegin;
569   if (!n) PetscFunctionReturn(0);
570   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
571     try {
572       /* next, figure out the number of nonzeros in the upper triangular matrix. */
573       nzUpper = adiag[0]-adiag[n];
574       if (!upTriFactor) {
575         PetscScalar *AAUp;
576 
577         PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
578 
579         /* Allocate Space for the upper triangular matrix */
580         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
581         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
582 
583         /* Fill the upper triangular matrix */
584         AiUp[0]=(PetscInt) 0;
585         AiUp[n]=nzUpper;
586         offset = nzUpper;
587         for (i=n-1; i>=0; i--) {
588           v  = aa + adiag[i+1] + 1;
589           vi = aj + adiag[i+1] + 1;
590 
591           /* number of elements NOT on the diagonal */
592           nz = adiag[i] - adiag[i+1]-1;
593 
594           /* decrement the offset */
595           offset -= (nz+1);
596 
597           /* first, set the diagonal elements */
598           AjUp[offset] = (PetscInt) i;
599           AAUp[offset] = (MatScalar)1./v[nz];
600           AiUp[i]      = AiUp[i+1] - (nz+1);
601 
602           PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
603           PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
604         }
605 
606         /* allocate space for the triangular factor information */
607         PetscCall(PetscNew(&upTriFactor));
608         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
609 
610         /* Create the matrix description */
611         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
612         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
613        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
615        #else
616         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
617        #endif
618         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
619         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
620 
621         /* set the operation */
622         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
623 
624         /* set the matrix */
625         upTriFactor->csrMat = new CsrMatrix;
626         upTriFactor->csrMat->num_rows = n;
627         upTriFactor->csrMat->num_cols = n;
628         upTriFactor->csrMat->num_entries = nzUpper;
629 
630         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
631         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
632 
633         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
634         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
635 
636         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
637         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
638 
639         /* Create the solve analysis information */
640         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
641         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
642       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
643         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
644                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
645                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
646                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
647                                                &upTriFactor->solveBufferSize));
648         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
649       #endif
650 
651         /* perform the solve analysis */
652         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
653                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
654                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
655                                          upTriFactor->csrMat->column_indices->data().get(),
656                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
657                                          upTriFactor->solveInfo,
658                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
659                                          #else
660                                          upTriFactor->solveInfo));
661                                          #endif
662         PetscCallCUDA(WaitForCUDA());
663         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
664 
665         /* assign the pointer */
666         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
667         upTriFactor->AA_h = AAUp;
668         PetscCallCUDA(cudaFreeHost(AiUp));
669         PetscCallCUDA(cudaFreeHost(AjUp));
670         PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
671       } else {
672         if (!upTriFactor->AA_h) {
673           PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
674         }
675         /* Fill the upper triangular matrix */
676         offset = nzUpper;
677         for (i=n-1; i>=0; i--) {
678           v  = aa + adiag[i+1] + 1;
679 
680           /* number of elements NOT on the diagonal */
681           nz = adiag[i] - adiag[i+1]-1;
682 
683           /* decrement the offset */
684           offset -= (nz+1);
685 
686           /* first, set the diagonal elements */
687           upTriFactor->AA_h[offset] = 1./v[nz];
688           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
689         }
690         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
691         PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
692       }
693     } catch(char *ex) {
694       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
695     }
696   }
697   PetscFunctionReturn(0);
698 }
699 
/*
  Pushes the ILU triangular factors of A to the GPU and caches the row and
  column permutation vectors used by the cuSPARSE-based MatSolve.  Safe to
  call repeatedly: the work vector and the permutation arrays are allocated
  only on the first call.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS = aij->row;
  IS                           colIS = aij->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* upload both triangular factors of the ILU decomposition to the device */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device, unless it is the identity */
  PetscCall(ISIdentity(rowIS,&rowIdentity));
  if (!rowIdentity && !fs->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(rowIS,&ridx));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(ridx,ridx+m);
    PetscCall(ISRestoreIndices(rowIS,&ridx));
    PetscCall(PetscLogCpuToGpu(m*sizeof(PetscInt)));
  }

  /* cache the column permutation on the device, unless it is the identity */
  PetscCall(ISIdentity(colIS,&colIdentity));
  if (!colIdentity && !fs->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(colIS,&cidx));
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(cidx,cidx+m);
    PetscCall(ISRestoreIndices(colIS,&cidx));
    PetscCall(PetscLogCpuToGpu(m*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
742 
/*
  Builds (first call) or value-refreshes (subsequent calls) the GPU copies of
  the two triangular factors used by the ICC solve.

  The host-side factor lives in A->data: the same pointer is cast both to
  Mat_SeqAIJ (only a->nz is read from it) and to Mat_SeqSBAIJ (row pointers
  b->i, column indices b->j, values b->a).  NOTE(review): this double cast
  relies on the Cholesky factor being stored in SBAIJ layout inside the AIJ
  wrapper -- confirm against the host factorization routine.

  Each source row appears to keep its diagonal entry last (the v[nz] accesses
  below -- TODO confirm); the rebuilt CSR rows place a scaled diagonal first,
  followed by the off-diagonal entries.  Both factors share the same sparsity
  pattern: the "lower" factor reuses the upper-triangular structure and is
  solved with CUSPARSE_OPERATION_TRANSPOSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices */
  PetscScalar                       *AAUp;          /* pinned host values for the upper factor */
  PetscScalar                       *AALo;          /* pinned host values for the transposed-solve (lower) factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)upload when the up-to-date copy is on the CPU (or nowhere yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers so the thrust assign()s below transfer quickly */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* first call: build the CSR structure (AiUp/AjUp) as well as the values */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: the rebuilt row starts with
             column i holding 1/d(i); AALo gets the same scaled diagonal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* copy the off-diagonal pattern/values, then negate for the upper
               factor and additionally scale by 1/d(i) for the lower one */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        /* unit diagonal: the 1/d(i) entries stored explicitly are not read by the solve */
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit analysis scratch buffer */
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* UPPER fill mode is intentional: the lower factor is the stored
           upper-triangular CSR solved with CUSPARSE_OPERATION_TRANSPOSE below */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same pattern (AiUp/AjUp) as the upper factor, AALo values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* refactorization: the sparsity pattern is unchanged, so only the
           numerical values are recomputed and re-uploaded */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
959 
/*
  Uploads the ICC triangular factors of A to the GPU and caches the row
  permutation (and its inverse, used for the column side) needed by MatSolve.

  Called after every numeric (re)factorization, so the work vector and the
  permutation arrays are allocated only once: without the rpermIndices guard
  below, each refactorization with a non-identity ordering would leak the
  previously allocated thrust arrays (the ILU counterpart,
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU, already guards the same way).
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* off-diagonal entries of the (symmetric) factor are stored once but used twice */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: build device-side permutations only once */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
996 
/*
  Numeric Cholesky factorization: the factorization itself runs on the host
  (after pulling A's values down from the GPU), then the resulting factors
  are analyzed and uploaded for cuSPARSE-based solves.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bfact = (Mat_SeqAIJ*)B->data;
  IS             rowperm = bfact->row;
  PetscBool      identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: a natural
     (identity) ordering lets the solve skip the permutation steps */
  PetscCall(ISIdentity(rowperm,&identity));
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU and run the solve analysis */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
1025 
1026 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1027 {
1028   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1029   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1030   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1031   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1032   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1033   cusparseIndexBase_t               indexBase;
1034   cusparseMatrixType_t              matrixType;
1035   cusparseFillMode_t                fillMode;
1036   cusparseDiagType_t                diagType;
1037 
1038   PetscFunctionBegin;
1039   /* allocate space for the transpose of the lower triangular factor */
1040   PetscCall(PetscNew(&loTriFactorT));
1041   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1042 
1043   /* set the matrix descriptors of the lower triangular factor */
1044   matrixType = cusparseGetMatType(loTriFactor->descr);
1045   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1046   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1047     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1048   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1049 
1050   /* Create the matrix description */
1051   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
1052   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
1053   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
1054   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
1055   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1056 
1057   /* set the operation */
1058   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1059 
1060   /* allocate GPU space for the CSC of the lower triangular factor*/
1061   loTriFactorT->csrMat = new CsrMatrix;
1062   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1063   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1064   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1065   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1066   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1067   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1068 
1069   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1070 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1071   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1072                                                loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1073                                                loTriFactor->csrMat->values->data().get(),
1074                                                loTriFactor->csrMat->row_offsets->data().get(),
1075                                                loTriFactor->csrMat->column_indices->data().get(),
1076                                                loTriFactorT->csrMat->values->data().get(),
1077                                                loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1078                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1079                                                CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
1080   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
1081 #endif
1082 
1083   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1084   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1085                                   loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1086                                   loTriFactor->csrMat->values->data().get(),
1087                                   loTriFactor->csrMat->row_offsets->data().get(),
1088                                   loTriFactor->csrMat->column_indices->data().get(),
1089                                   loTriFactorT->csrMat->values->data().get(),
1090                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1091                                   loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1092                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1093                                   CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
1094                                   #else
1095                                   loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1096                                   CUSPARSE_ACTION_NUMERIC, indexBase));
1097                                   #endif
1098   PetscCallCUDA(WaitForCUDA());
1099   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1100 
1101   /* Create the solve analysis information */
1102   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1103   PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
1104 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1105   PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1106                                          loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1107                                          loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1108                                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1109                                          &loTriFactorT->solveBufferSize));
1110   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
1111 #endif
1112 
1113   /* perform the solve analysis */
1114   PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1115                                    loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1116                                    loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1117                                    loTriFactorT->csrMat->column_indices->data().get(),
1118                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1119                                    loTriFactorT->solveInfo,
1120                                    loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1121                                    #else
1122                                    loTriFactorT->solveInfo));
1123                                    #endif
1124   PetscCallCUDA(WaitForCUDA());
1125   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1126 
1127   /* assign the pointer */
1128   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1129 
1130   /*********************************************/
1131   /* Now the Transpose of the Upper Tri Factor */
1132   /*********************************************/
1133 
1134   /* allocate space for the transpose of the upper triangular factor */
1135   PetscCall(PetscNew(&upTriFactorT));
1136   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1137 
1138   /* set the matrix descriptors of the upper triangular factor */
1139   matrixType = cusparseGetMatType(upTriFactor->descr);
1140   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1141   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1142     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1143   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1144 
1145   /* Create the matrix description */
1146   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
1147   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1148   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1149   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1150   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1151 
1152   /* set the operation */
1153   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1154 
1155   /* allocate GPU space for the CSC of the upper triangular factor*/
1156   upTriFactorT->csrMat = new CsrMatrix;
1157   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1158   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1159   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1160   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1161   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1162   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1163 
1164   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1165 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1166   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1167                                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1168                                                upTriFactor->csrMat->values->data().get(),
1169                                                upTriFactor->csrMat->row_offsets->data().get(),
1170                                                upTriFactor->csrMat->column_indices->data().get(),
1171                                                upTriFactorT->csrMat->values->data().get(),
1172                                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1173                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1174                                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1175   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
1176 #endif
1177 
1178   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1179   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1180                                   upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1181                                   upTriFactor->csrMat->values->data().get(),
1182                                   upTriFactor->csrMat->row_offsets->data().get(),
1183                                   upTriFactor->csrMat->column_indices->data().get(),
1184                                   upTriFactorT->csrMat->values->data().get(),
1185                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1186                                   upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1187                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1188                                   CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
1189                                   #else
1190                                   upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1191                                  CUSPARSE_ACTION_NUMERIC, indexBase));
1192                                  #endif
1193 
1194   PetscCallCUDA(WaitForCUDA());
1195   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1196 
1197   /* Create the solve analysis information */
1198   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1199   PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
1200   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1201   PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1202                                          upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1203                                          upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1204                                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1205                                          &upTriFactorT->solveBufferSize));
1206   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
1207   #endif
1208 
1209   /* perform the solve analysis */
  /* TODO: this solve-analysis setup is repeated for each triangular factor; refactor it into a shared helper */
1211   PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1212                                    upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1213                                    upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1214                                    upTriFactorT->csrMat->column_indices->data().get(),
1215                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1216                                    upTriFactorT->solveInfo,
1217                                    upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1218                                    #else
1219                                    upTriFactorT->solveInfo));
1220                                    #endif
1221 
1222   PetscCallCUDA(WaitForCUDA());
1223   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1224 
1225   /* assign the pointer */
1226   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1227   PetscFunctionReturn(0);
1228 }
1229 
/* Unary functor converting a PetscScalar to a PetscInt by truncating the
   real part of the scalar. Used below with thrust::transform to recover an
   integer permutation array (csr2csc_i) from indices that were carried
   through csr2csc in a scalar-valued array. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar val)
  {
    return static_cast<PetscInt>(PetscRealPart(val));
  }
};
1238 
/* Build (or refresh) an explicit transpose of a SeqAIJCUSPARSE matrix on the GPU and
   cache it in cusparsestruct->matTranspose.

   - For MAT_CUSPARSE_CSR the transpose structure is created once (csr2csc) and on later
     calls only the numerical values are permuted into it via the cached csr2csc_i map.
   - For MAT_CUSPARSE_ELL/HYB (CUDA < 11 only) the matrix is converted HYB -> CSR -> CSC -> HYB.
   - No-op when A->transupdated is already set.

   Collective on A; returns a PETSc error code. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats rebuild the transpose from scratch every time */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-side scalars used by SpMV with the transpose */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the transposed CSR skeleton: dimensions swapped, same nnz */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* mirror the host row offsets of A on the GPU (needed by csr2csc below) */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        /* generic-API descriptor for the transpose (CUDA >= 11.2.1 handles nnz == 0) */
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT uses A->rmap->n rows / A->cmap->n cols like temp (not swapped);
         this matches the historical code but looks asymmetric for a transpose -- confirm */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC value-permutation map once: run csr2csc on a scalar array
         holding the sequence 0,1,2,... so that the transposed "values" come out as the
         source position of each entry; then convert them to integers in csr2csc_i. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* csr2cscEx2 (CUDA >= 11) requires an explicit user-provided work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
     #endif
    }
    /* gather the current values of A into the transpose layout via the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the compressed row indices are not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1466 
1467 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU using transposed triangular-factor structures
   (built lazily on first use via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve).
   Data flow: b is reordered by the row permutation into x, the transposed
   upper solve maps x into the work vector, the transposed lower solve maps
   the work vector back into x, and finally the column permutation is applied
   through the work vector (the permuted copy cannot be done in place). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only once */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: reads x, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: reads the work vector, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1554 
/* Solve A^T x = b on the GPU when the factorization used natural ordering,
   so no row/column permutations are needed: the transposed upper solve maps
   b into the work vector and the transposed lower solve maps the work vector
   into x. Transposed factor structures are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly, only once */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the transposed upper factor: reads b, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: reads the work vector, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1623 
/* Solve A x = b on the GPU using the cached lower/upper triangular factors.
   Data flow: b is reordered by the row permutation into the work vector, the
   lower solve maps the work vector into x, the upper solve maps x back into
   the work vector, and the column permutation scatters the work vector into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: reads the work vector, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve U: reads x, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1698 
/* Solve A x = b on the GPU when the factorization used natural ordering,
   so no permutations are needed: the lower solve maps b into the work
   vector and the upper solve maps the work vector into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: reads b, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Next, solve U: reads the work vector, writes x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1759 
/* Download the CSR numerical values from the device copy back to the host
   Mat_SeqAIJ arrays when the GPU holds the only up-to-date data. Only the
   values (a->a) are transferred - the nonzero structure is assumed unchanged.
   Afterwards both copies are valid (PETSC_OFFLOAD_BOTH); a no-op otherwise. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    PetscCallCUDA(WaitForCUDA()); /* make the transfer observable before we log and flip the mask */
    PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1778 
/* Read-write access to the host values array: first sync the host copy from
   the GPU (if needed), then hand out the Mat_SeqAIJ value pointer. The
   matching Restore marks the host as the authoritative copy. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1786 
/* End read-write host access: the caller may have modified the host values,
   so mark the host as the only valid copy; the next GPU use re-uploads. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}
1794 
/* Read-only access to the host values array: sync from the GPU if the device
   copy is newer, then return the host pointer. Does not change the mask. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1802 
/* End read-only host access: nothing was modified, so the offload mask is
   left untouched; just invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
1809 
/* Write-only access to the host values array: no download from the GPU is
   performed since the caller promises to overwrite the values entirely. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
1816 
/* End write-only host access: the host values were (re)written, so the host
   becomes the only valid copy; the next GPU use re-uploads. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}
1824 
/* Upload the host AIJ matrix to the GPU in the requested cuSPARSE storage
   format (CSR, or ELL/HYB before CUDA 11). If the nonzero structure is
   unchanged since the last upload (same nonzerostate) only the values are
   re-assigned; otherwise the device structures are rebuilt from scratch,
   honoring compressed-row storage when in use. On success the offload mask
   becomes PETSC_OFFLOAD_BOTH (unless the host has no values array yet).

   Fix over previous revision: thrust raises exceptions derived from
   std::exception (e.g. thrust::system_error on allocation/copy failure),
   which the historical `catch (char *)` handler can never match, letting the
   exception escape and terminate the program. A std::exception handler is
   added (the char* one is kept for backward compatibility). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* cached transpose values are now stale; keep its structure */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) { /* only rows with nonzeros are stored */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values yet: upload structure only */
        else nnz = a->nz;
        PetscCheckFalse(nnz && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident 1/0 constants, used because the handle runs in device pointer mode */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage into a temporary CSR matrix, convert to HYB, then discard the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) { /* legacy handler, kept for backward compatibility */
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      } catch(const std::exception &ex) { /* thrust throws std::exception-derived types (e.g. thrust::system_error) */
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex.what());
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1979 
/* Thrust functor for zipped iterators: accumulate tuple slot 0 into slot 1,
   i.e. dst += src, usable on both host and device. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1989 
/* Thrust functor for zipped iterators: copy tuple slot 0 into slot 1,
   i.e. dst = src, usable on both host and device. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1999 
/* Thrust functor for zipped iterators: copy tuple slot 1 into slot 0
   (the opposite direction of VecCUDAEquals), usable on both host and device. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
2009 
/* Per-product scratch data attached to C->product->data for the sparse-dense
   and sparse-sparse matrix products implemented below; freed by
   MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was MATSEQDENSE on entry to symbolic; convert back after numeric */
  PetscScalar           *Bt;       /* device buffer for B^T (pre-CUDA-11 csrmm has no transpose-B support) */
  Mat                   X;         /* intermediate dense product for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;     /* uncompressed copy of B when B uses compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* descriptor for Bcsr, when used */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4; /* SpGEMMreuse work buffers */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2034 
/* Destructor installed as C->product->destroy: releases all device buffers,
   cuSPARSE descriptors and the intermediate dense matrix held by a
   MatMatCusparse. NULL members are safe (cudaFree(NULL) is a no-op and the
   descriptor destroys are guarded). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5)  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
 #endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2058 
2059 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2060 
/* Numeric phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense.
   Supports AB, AtB, ABt, PtAP (as A*P followed by P^T*X) and RARt (as A*R^T
   followed by R*X). On CUDA >= 11 uses cusparseSpMM (which handles op(B));
   earlier CUDA uses csrmm after explicitly transposing B with cublasXgeam. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the output dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* first stage computes X = A*P */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* use the cached explicit transpose of A instead of op = transpose */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* first stage computes X = A*R^T */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* two-stage products write the sparse-dense result into the intermediate X */
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    /* out-of-place transpose of B into the preallocated mmdata->Bt buffer */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
 #endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* second stage for the two-stage products: C = R*X or C = P^T*X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  if (mmdata->cisdense) { /* the user originally had a CPU dense C; convert back */
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) { /* likewise restore B to its original CPU dense type */
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2231 
/* Symbolic phase of C = op(A)*op(B) with A sparse and B dense: sets the sizes
   and (CUDA) type of C, allocates the MatMatCusparse scratch data (including
   the B^T buffer on pre-CUDA-11 and the intermediate X for PtAP/RARt), and
   installs the numeric-phase callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2304 
2305 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2306 {
2307   Mat_Product                  *product = C->product;
2308   Mat                          A,B;
2309   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2310   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2311   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2312   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2313   PetscBool                    flg;
2314   cusparseStatus_t             stat;
2315   MatProductType               ptype;
2316   MatMatCusparse               *mmdata;
2317 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2318   cusparseSpMatDescr_t         BmatSpDescr;
2319 #endif
2320   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2321 
2322   PetscFunctionBegin;
2323   MatCheckProduct(C,1);
2324   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2325   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
2326   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2327   mmdata = (MatMatCusparse*)C->product->data;
2328   A = product->A;
2329   B = product->B;
2330   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2331     mmdata->reusesym = PETSC_FALSE;
2332     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2333     PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2334     Cmat = Ccusp->mat;
2335     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2336     Ccsr = (CsrMatrix*)Cmat->mat;
2337     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2338     goto finalize;
2339   }
2340   if (!c->nz) goto finalize;
2341   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2342   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2343   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2344   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2345   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2346   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2347   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2348   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2349   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2350   PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2351   PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2352   PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2353   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2354   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2355 
2356   ptype = product->type;
2357   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2358     ptype = MATPRODUCT_AB;
2359     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2360   }
2361   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2362     ptype = MATPRODUCT_AB;
2363     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2364   }
2365   switch (ptype) {
2366   case MATPRODUCT_AB:
2367     Amat = Acusp->mat;
2368     Bmat = Bcusp->mat;
2369     break;
2370   case MATPRODUCT_AtB:
2371     Amat = Acusp->matTranspose;
2372     Bmat = Bcusp->mat;
2373     break;
2374   case MATPRODUCT_ABt:
2375     Amat = Acusp->mat;
2376     Bmat = Bcusp->matTranspose;
2377     break;
2378   default:
2379     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2380   }
2381   Cmat = Ccusp->mat;
2382   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2383   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2384   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2385   Acsr = (CsrMatrix*)Amat->mat;
2386   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2387   Ccsr = (CsrMatrix*)Cmat->mat;
2388   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2389   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2390   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2391   PetscCall(PetscLogGpuTimeBegin());
2392 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2393   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2394   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2395   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2396     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2397                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2398                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2399                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2400   #else
2401     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2402                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2403                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2404                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2405     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2406                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2407                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2408   #endif
2409 #else
2410   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2411                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2412                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2413                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2414                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2415 #endif
2416   PetscCall(PetscLogGpuFlops(mmdata->flops));
2417   PetscCallCUDA(WaitForCUDA());
2418   PetscCall(PetscLogGpuTimeEnd());
2419   C->offloadmask = PETSC_OFFLOAD_GPU;
2420 finalize:
2421   /* shorter version of MatAssemblyEnd_SeqAIJ */
2422   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
2423   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
2424   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2425   c->reallocs         = 0;
2426   C->info.mallocs    += 0;
2427   C->info.nz_unneeded = 0;
2428   C->assembled = C->was_assembled = PETSC_TRUE;
2429   C->num_ass++;
2430   PetscFunctionReturn(0);
2431 }
2432 
/*
  Symbolic phase of the sparse matrix-matrix products C = A*B, A^T*B, or A*B^T for two
  MATSEQAIJCUSPARSE matrices, performed on the GPU with cuSPARSE SpGEMM.

  Builds the device CSR structure of C (row offsets, column indices, a values array),
  mirrors the sparsity pattern back to the host Mat_SeqAIJ arrays (i, j, ilen, imax, ...)
  as MatAssemblyEnd_SeqAIJ would, and installs MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE
  as the numeric-phase callback.

  Note: with the CUDA >= 11 SpGEMM APIs the numeric values of C are unavoidably computed
  here as well (the APIs have no pure-symbolic mode; see the comments in the pre-11 branch),
  so the work buffers and descriptors needed for a cheap numeric re-computation are kept
  in the MatMatCusparse product data (mmdata).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE (exact type, not a base-type match) */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* If an operand is symmetric, replace the transposed product with the cheaper plain
     product and record that fact so the numeric phase can verify the same shortcut */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* Select the operand mult structures and the result dimensions (m x n, inner dim k).
     Since cuSPARSE SpGEMM only supports NON_TRANSPOSE ops (see opA/opB above), transposes
     are materialized explicitly via MatSeqAIJCUSPARSEFormExplicitTranspose */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  /* (sequential matrix: local and global sizes coincide) */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalars (alpha = 1, beta = 0/1) used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with uncompressed row offsets; column indices and
       values are shared with B's own storage, only the row_offsets array is new */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  /* (counted from the host CSR patterns; stored in mmdata so the numeric phase can log them too) */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: generic SpGEMM APIs; the nnz of C is only known after the compute/nnz phase,
     so C's descriptor starts with 0 entries and NULL arrays, set for real below */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
  PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
  /* buffers 1 and 2 are only needed up to the nnz phase; 3 is still needed by the copy phase below */
  PetscCallCUDA(cudaFree(dBuffer1));
  PetscCallCUDA(cudaFree(dBuffer2));

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer3));
  /* first numeric computation; subsequent ones are done by the numeric phase reusing
     dBuffer4/dBuffer5 kept in mmdata */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* CUDA < 11: legacy csrgemm API, nnz query with host pointer mode first */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror C's device CSR pattern into the host Mat_SeqAIJ arrays (i, j, ilen, imax, diag, ...) */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths, nonzero-row count and max row length from the expanded offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2829 
2830 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2831 
/* handles sparse or dense B

   Decides which product-symbolic backend to install on 'mat' for the requested
   MatProduct type: the CUSPARSE sparse-sparse path, the sparse-dense CUDA path,
   the generic ABC fallback, or the plain SeqAIJ (CPU) implementation.  The user
   can force the CPU backend through the per-product-type options parsed below
   (e.g. -matmatmult_backend_cpu / -mat_product_algorithm_backend_cpu). */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr; /* needed because PetscOptionsBegin/End are statement macros and cannot be wrapped in PetscCall() */
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* base-type compare: matches MATSEQDENSE-derived types for B */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* only consider the GPU sparse path when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* option names differ depending on whether the old API (MatMatMult etc.) or the
       MatProduct API created this product */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      } else {
        ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");PetscCall(ierr);
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        ierr = PetscOptionsEnd();PetscCall(ierr);
      }
      break;
    default:
      break;
    }
    /* forcing the CPU backend is implemented by pretending the operands are not CUSPARSE */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* composite products go through the generic implementation built on binary products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
2954 
/* yy = A*xx. Thin wrapper over the shared SpMV kernel: no added vector (yy slot is NULL),
   no transpose, no Hermitian conjugation. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2961 
/* zz = A*xx + yy. Delegates to the shared SpMV kernel with trans=herm=PETSC_FALSE. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2968 
/* yy = A^H*xx (conjugate transpose). Delegates with trans=PETSC_TRUE, herm=PETSC_TRUE. */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2975 
/* zz = A^H*xx + yy (conjugate transpose plus add). Delegates with trans=PETSC_TRUE, herm=PETSC_TRUE. */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2982 
/* yy = A^T*xx (plain transpose, no conjugation). Delegates with trans=PETSC_TRUE, herm=PETSC_FALSE. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2989 
/* Device kernel: y[idx[i]] += x[i] for 0 <= i < n.
   Used by MatMultAddKernel_SeqAIJCUSPARSE() to scatter-add the short work vector
   (one entry per stored/nonzero row of a compressed matrix) into the full-length
   result vector. Launched as a 1-D grid of 256-thread blocks covering n; the
   trailing partial block is handled by the bounds check. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* Compute the flat global index in PetscInt: the old `int` arithmetic could overflow
     when n exceeds 2^31-1 and PetscInt is 64-bit. */
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2995 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose
   (and their *Add variants). yy may be NULL (no add); yy may alias zz (in-place add).
   Handles compressed-row storage (zero rows dropped) via a work vector plus a scatter. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* Empty matrix: the product is zero, so just set or copy without touching cuSPARSE */
  if (!a->nz) {
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Either let cuSPARSE apply op() to the non-transposed matrix, or use a cached explicit transpose */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  /* thrust calls may throw; the catch below converts exceptions into a PETSc error */
  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* For the transposed op, x has num_rows entries and y has num_cols entries */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[]; guard against enum value drift */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (mul+add) per stored nonzero; without yy, one add per nonzero row is not performed */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3184 
/* zz = A^T*xx + yy (plain transpose plus add). Delegates with trans=PETSC_TRUE, herm=PETSC_FALSE. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3191 
/* Finish assembly via the base SeqAIJ routine, then discard the cached raw device
   matrix if the assembly changed the nonzero pattern (it would mirror stale structure). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *spptr        = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state_before  = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (spptr->deviceMat && state_before != A->nonzerostate) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(spptr->deviceMat));
    spptr->deviceMat = NULL; /* rebuilt on demand */
  }
  PetscFunctionReturn(0);
}
3207 
3208 /* --------------------------------------------------------------------------------*/
3209 /*@
3210    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3213    assembly performance the user should preallocate the matrix storage by setting
3214    the parameter nz (or the array nnz).  By setting these parameters accurately,
3215    performance during matrix assembly can be increased by more than a factor of 50.
3216 
3217    Collective
3218 
3219    Input Parameters:
3220 +  comm - MPI communicator, set to PETSC_COMM_SELF
3221 .  m - number of rows
3222 .  n - number of columns
3223 .  nz - number of nonzeros per row (same for all rows)
3224 -  nnz - array containing the number of nonzeros in the various rows
3225          (possibly different for each row) or NULL
3226 
3227    Output Parameter:
3228 .  A - the matrix
3229 
3230    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3232    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3233 
3234    Notes:
3235    If nnz is given then nz is ignored
3236 
3237    The AIJ format (also called the Yale sparse matrix format or
3238    compressed row storage), is fully compatible with standard Fortran 77
3239    storage.  That is, the stored row and column indices can begin at
3240    either one (as in Fortran) or zero.  See the users' manual for details.
3241 
3242    Specify the preallocated storage with either nz or nnz (not both).
3243    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3244    allocation.  For large problems you MUST preallocate memory or you
3245    will get TERRIBLE performance, see the users' manual chapter on matrices.
3246 
3247    By default, this format uses inodes (identical nodes) when possible, to
3248    improve numerical efficiency of matrix-vector products and solves. We
3249    search for consecutive rows with the same nonzero structure, thereby
3250    reusing matrix information to achieve increased efficiency.
3251 
3252    Level: intermediate
3253 
3254 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3255 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n)); /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  /* call the SeqAIJ implementation directly, bypassing the MatSeqAIJSetPreallocation() dispatch */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
3265 
/* Destroy the GPU-side data (plain matrix or triangular factors), detach every
   method composed on this object, then hand off to the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of all functions composed by this implementation; cleared in one pass below */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };
  size_t i;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  for (i=0; i<sizeof(composed)/sizeof(composed[0]); i++) {
    PetscCall(PetscObjectComposeFunction((PetscObject)A,composed[i],NULL));
  }
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3287 
3288 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3289 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a plain SeqAIJ copy first, then converting that copy
   in place to SeqAIJCUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
3297 
/* Y = Y + a*X, computed on the GPU when both operands live there.
   Three paths: identical nonzero pattern -> one cuBLAS axpy over the value arrays;
   X's pattern a subset of Y's -> cuSPARSE csrgeam (Y's structure unchanged);
   anything else -> fall back to the CPU SeqAIJ implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed bindings (one matrix on CPU): invalidate Y's cached transpose and use the CPU path */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/column structure on the device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* Y keeps unit weight: result = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* the scalars a and b live on the host, so switch the cuSPARSE pointer mode for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 csrgeam requires the query-allocate-compute buffer protocol */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the pointer mode expected by the rest of this file */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical structure: the value arrays line up entry-by-entry, so a flat axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* patterns differ: fall back to the CPU implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
3391 
/* Y = a*Y, performed on the GPU as a single cuBLAS scal over the flat array of stored nonzeros */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *values;
  cublasHandle_t handle;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&values));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz,&n)); /* nz must fit in a BLAS int */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle,n,&a,values,stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3411 
/* Zero every stored entry of A. The host copy is always zeroed; device CSR values
   (and the cached transpose's values) are zeroed too when they exist, in which case
   the offload mask records that both copies are valid. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij   = (Mat_SeqAIJ*)A->data;
  PetscBool  ongpu  = PETSC_FALSE; /* did we zero the device values as well? */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) { /* keep the cached explicit transpose consistent */
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3440 
/* Switch A's method table between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE) implementations.
   flg=PETSC_TRUE binds to CPU: data is fetched from the GPU, ops point at SeqAIJ routines,
   and GPU-only composed functions are detached. flg=PETSC_FALSE restores the GPU routines. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the flag; their ops are managed elsewhere */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* bring the values back to the host before the CPU routines take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* reset the inner SeqAIJ ops (getarray etc.) to their defaults */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to CPU and present */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3503 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE.

  Depending on reuse, the result is a duplicate (MAT_INITIAL_MATRIX), a copy into *newmat
  (MAT_REUSE_MATRIX), or A itself retyped in place. The converted matrix gets VECCUDA as its
  default vector type, a GPU-side spptr (Mat_SeqAIJCUSPARSE for regular matrices,
  Mat_SeqAIJCUSPARSETriFactors for factored ones), and the CUSPARSE method table.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat; /* with MAT_INPLACE_MATRIX, *newmat is the input matrix itself */

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype)); /* vectors created from B default to the CUDA type */

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead of a plain CSR matrix */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* neither host nor device copy exists yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); /* install the GPU implementations of the remaining ops */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3563 
/* MATSEQAIJCUSPARSE constructor: create a plain SeqAIJ matrix, then retype it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  /* MAT_INPLACE_MATRIX: B itself is converted; MatConvert also initializes the CUDA device */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
3571 
3572 /*MC
3573    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3574 
3575    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
3576    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3577    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3578 
3579    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3584 
3585   Level: beginner
3586 
3587 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3588 M*/
3589 
3590 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3591 
/*
  MatSolverTypeRegister_CUSPARSE - register the CUSPARSE-based factorizations:
  the banded LU solver for MATSEQAIJ, and LU/Cholesky/ILU/ICC for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
3603 
/*
  MatResetPreallocationCOO_SeqAIJCUSPARSE - release all COO-related device data (the basic-path
  permutations cooPerm/cooPerm_a and the extended-path jmap_d/perm_d maps) so the matrix can be
  re-preallocated with a new COO pattern. No-op when the matrix has no CUSPARSE struct.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* zero the freed device pointers: MatSeqAIJCUSPARSE_Destroy() frees any non-NULL
       jmap_d/perm_d, so leaving them set would risk a double free */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3621 
/*
  MatSeqAIJCUSPARSE_Destroy - free all GPU-side data owned by a Mat_SeqAIJCUSPARSE struct
  (matrix and transpose mult structs, work vectors, COO permutations, csr2csc index array,
  extended-COO device maps, and the cusparse handle), then free the struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); /* extended-COO device maps */
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
3640 
/* Free the three thrust device arrays backing a CsrMatrix and the struct itself; *mat becomes NULL */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3653 
/*
  Destroy one triangular-factor struct: its cusparse matrix descriptor and analysis info,
  its CSR storage, and the scratch buffers used by the triangular solve and (CUDA >= 11)
  the csr2csc conversion.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host buffer, hence cudaFreeHost */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
3670 
/*
  Destroy a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR, or HYB/ELL on pre-11.0 CUDA),
  the cusparse descriptor, the compressed-row index array, the device-resident scalar constants,
  and (CUDA >= 11) the cached SpMV descriptors and buffers.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* check the error code, consistent with every other call here */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) { /* the three cached SpMV configurations */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3712 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - release all factorization data held by the trifactors
  struct (both factors, their transposes, permutation index arrays, work vector, and the
  band-solver device arrays) while keeping the struct and its cusparse handle alive so a
  new factorization can reuse them.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
    if ((*trifactors)->i_band_d)   PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
    /* zero the freed device pointers so a subsequent Reset (e.g. from Destroy) cannot double-free them */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3733 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - reset (free) all factor data, destroy the cusparse
  handle if one was created, then free the trifactors struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle;

    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    handle = (*trifactors)->handle; /* explicit assignment instead of the warning-prone `if (handle = ...)` */
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3748 
/* Lexicographic "less than" on (row,col) tuples: orders COO entries by row, then by column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b)
  {
    const PetscInt ra = a.get<0>(), rb = b.get<0>();
    return (ra < rb) || (ra == rb && a.get<1>() < b.get<1>());
  }
};
3759 
/* Equality on (row,col) tuples: true iff both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b)
  {
    return a.get<0>() == b.get<0>() && a.get<1>() == b.get<1>();
  }
};
3769 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &a, const PetscInt &b)
  {
    return (a != b) ? 1 : 0;
  }
};
3778 
/* Logical OR of two integer flags, returned as a 0/1 PetscInt */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &a, const PetscInt &b)
  {
    return (a || b) ? 1 : 0;
  }
};
3787 
3788 #include <thrust/iterator/discard_iterator.h>
3789 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - insert or add the COO values v[] into the GPU CSR
  value array, using the permutation cooPerm[] (and, when the COO list had repeated (i,j)
  entries, the compression map cooPerm_a[]) computed at preallocation time.
  v may live on host or device memory; a NULL v zeroes the values (INSERT_VALUES) or leaves
  them unchanged (ADD_VALUES).
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation was done; just (re)assemble */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* host values: stage them in a temporary device array first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them up before adding to the matrix */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is now the authoritative one */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3870 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - mark the cached transpose of A as out of date;
  when destroy is true, also free the cached transpose mult struct and the csr2csc index array.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3886 
3887 #include <thrust/binary_search.h>
3888 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic - build the CSR structure of A from the n COO
  pairs (coo_i[],coo_j[]), mostly on the GPU: sort the pairs (recording the permutation
  cooPerm), compress duplicates (recording in cooPerm_a which unique nonzero each COO entry
  maps to), and derive the host CSR arrays a->i/a->j from the sorted, uniqued pairs.
  Only valid when all indices are nonnegative (the "basic" restriction).
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* COO size changed: the cached permutations are unusable */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of the SeqAIJ part with the freshly computed pattern */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
4000 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - preallocate mat from COO input. If i/j are device
  pointers, or host pointers with no negative indices, use the GPU "basic" path; otherwise
  fall back to the host SeqAIJ COO preallocation and mirror its jmap/perm maps to the device
  for use by the MatSetValuesCOO kernel.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices (presumably entries to be skipped — only the host path handles them) disqualify the basic path */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* copy the host-computed COO->CSR maps to the device: jmap has nz+1 offsets, perm has Atot entries */
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4036 
/* Device kernel: accumulate COO values kv[] into the CSR value array a[].
   Launched with a 1D grid; the grid-stride loop covers all nnz CSR slots regardless of grid size.
   For CSR slot i, the COO entries contributing to it are kv[perm[k]] for k in [jmap[i],jmap[i+1]).
   With INSERT_VALUES the old a[i] is discarded; with other modes the sum is added to a[i]. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4047 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - set the CSR values of A from COO input v[].
  The extended-COO path runs the MatAddCOOValues kernel with the device jmap/perm maps;
  otherwise the thrust-based "basic" implementation is used.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* write-only access suffices for INSERT_VALUES: the kernel overwrites every CSR entry in that mode */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); /* free the staging copy */
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}
4082 
4083 /*@C
4084     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4085 
4086    Not collective
4087 
4088     Input Parameters:
4089 +   A - the matrix
4090 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4091 
4092     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4095 
4096     Level: developer
4097 
4098     Notes:
4099       When compressed is true, the CSR structure does not contain empty rows
4100 
4101 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4102 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): this early return requires BOTH output pointers, yet the code below guards
     i and j independently — requesting only one may have been intended to work; confirm */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR data is current */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily upload and cache the uncompressed host row offsets a->i */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4130 
4131 /*@C
4132     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4133 
4134    Not collective
4135 
4136     Input Parameters:
4137 +   A - the matrix
4138 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4139 
4140     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4143 
4144     Level: developer
4145 
4146 .seealso: MatSeqAIJCUSPARSEGetIJ()
4147 @*/
/* Companion of MatSeqAIJCUSPARSEGetIJ(): only invalidates the caller's pointers, no device work */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4157 
4158 /*@C
4159    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4160 
4161    Not Collective
4162 
4163    Input Parameter:
4164 .   A - a MATSEQAIJCUSPARSE matrix
4165 
4166    Output Parameter:
4167 .   a - pointer to the device data
4168 
4169    Level: developer
4170 
4171    Notes: may trigger host-device copies if up-to-date matrix data is on host
4172 
4173 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4174 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy of the values */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* read-only access: offloadmask is deliberately left unchanged */
  PetscFunctionReturn(0);
}
4192 
4193 /*@C
4194    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4195 
4196    Not Collective
4197 
4198    Input Parameter:
4199 .   A - a MATSEQAIJCUSPARSE matrix
4200 
4201    Output Parameter:
4202 .   a - pointer to the device data
4203 
4204    Level: developer
4205 
4206 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4207 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's pointer; read-only access needs no further bookkeeping */
  PetscFunctionReturn(0);
}
4217 
4218 /*@C
4219    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4220 
4221    Not Collective
4222 
4223    Input Parameter:
4224 .   A - a MATSEQAIJCUSPARSE matrix
4225 
4226    Output Parameter:
4227 .   a - pointer to the device data
4228 
4229    Level: developer
4230 
4231    Notes: may trigger host-device copies if up-to-date matrix data is on host
4232 
4233 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4234 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  /* Hand out a read-write raw device pointer to the CSR value array; the GPU copy becomes the valid one */
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a plain value array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy if host data is newer */
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get(); /* raw pointer of the underlying thrust device vector */
  A->offloadmask = PETSC_OFFLOAD_GPU;                             /* device copy is now authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}
4254 /*@C
4255    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4256 
4257    Not Collective
4258 
4259    Input Parameter:
4260 .   A - a MATSEQAIJCUSPARSE matrix
4261 
4262    Output Parameter:
4263 .   a - pointer to the device data
4264 
4265    Level: developer
4266 
4267 .seealso: MatSeqAIJCUSPARSEGetArray()
4268 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the caller may have changed values on the device: drop the cached diagonal
     and bump the object state so dependent objects notice the change */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* null the caller's pointer so it cannot be used after restore */
  PetscFunctionReturn(0);
}
4280 
4281 /*@C
4282    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4283 
4284    Not Collective
4285 
4286    Input Parameter:
4287 .   A - a MATSEQAIJCUSPARSE matrix
4288 
4289    Output Parameter:
4290 .   a - pointer to the device data
4291 
4292    Level: developer
4293 
4294    Notes: does not trigger host-device copies and flags data validity on the GPU
4295 
4296 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4297 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  /* Hand out a write-only raw device pointer to the CSR value array; unlike the
     read/read-write variants, no host-to-device copy is performed since the
     caller is expected to overwrite the values */
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a plain value array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get(); /* raw pointer of the underlying thrust device vector */
  A->offloadmask = PETSC_OFFLOAD_GPU;                             /* device copy is now authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}
4316 
4317 /*@C
4318    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4319 
4320    Not Collective
4321 
4322    Input Parameter:
4323 .   A - a MATSEQAIJCUSPARSE matrix
4324 
4325    Output Parameter:
4326 .   a - pointer to the device data
4327 
4328    Level: developer
4329 
4330 .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4331 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the caller wrote new values on the device: drop the cached diagonal
     and bump the object state so dependent objects notice the change */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL; /* null the caller's pointer so it cannot be used after restore */
  PetscFunctionReturn(0);
}
4343 
/* Strict-weak ordering on (row, col, value, flag) tuples: lexicographic by (row, col) */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    /* order by row first; fall back to the column index on equal rows */
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
4354 
/* Unary functor that adds a fixed integer offset to its argument */
struct Shift
{
  int _delta; /* offset applied by operator() */

  Shift(int d) : _delta(d) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _delta + c;
  }
};
4366 
4367 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* NOTE(review): "number or rows" in the message below looks like a typo for "number of rows" */
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Build C = [A B] from scratch: allocate the device CSR structure of C, merge the
       entries of A and B on the GPU, then mirror the structural (i,j) data on the host */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records where each entry of A (first Annz slots) and B (remaining Bnnz slots)
       landed inside C, so MAT_REUSE_MATRIX calls can scatter values without re-merging */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand the CSR row offsets of A and B into COO row indices so the entries of the
         two matrices can be interleaved row by row with a single merge below */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* constant origin flags: 1 marks entries coming from A, 0 entries coming from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* columns of B are shifted by A->cmap->n since B sits to the right of A in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge the (row, col, value, origin-flag) tuples of A and B ordered by (row, col);
         wPerm receives the origin flags in merged order */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place column shift applied to B above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* split the merged positions by origin flag: positions of A's entries go to p1
         (first Annz slots of cooPerm), positions of B's entries go to p2 */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back to CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C^T = [A^T; B^T] is obtained by simple stacking: rows of C^T coming from A^T keep
           their offsets, rows coming from B^T are appended with offsets shifted by a->nz */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* B^T's first offset overwrites A^T's trailing one (both equal a->nz after shifting) */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the merged CSR structure on the host so C behaves as a regular SeqAIJ matrix */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* per-row bookkeeping (row lengths, max row length, nonzero-row count) from the row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the structure of C is unchanged, only scatter the numerical values
       of A and B into C through the permutation stored in cooPerm */
    /* NOTE(review): "number or rows" in the message below looks like a typo for "number of rows" */
    PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C at the positions recorded in cooPerm[0..Annz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C at the positions recorded in cooPerm[Annz..) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh the stacked transpose values directly when all three transposes exist */
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4668 
/* Gather matrix values v[k] = av[idx[k]] (or copy the first n values when idx is NULL);
   v may be either a host or a device pointer, detected via isCudaMem() */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the index set and gather on the device with a permutation iterator */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is a host pointer: stage the gathered values in a device buffer first */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    /* no index set: plain contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* bug fix: the transfer just performed for a host destination moves data device->host,
     so log it as GpuToCpu (previously mislogged as CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4706