xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4ffacfe27a72f4cdf51b68a3bbb6aed96040fb2f)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Enum-name tables consumed by PetscOptionsEnum(); the trailing entries give the enum type
   name and the common prefix, followed by a 0 sentinel, per PETSc convention */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Note: entries appear in 0-based value order, so the array index chosen by PetscOptionsEnum()
     maps directly onto the cuSPARSE enum value (SpMM has COO_ALG4/CSR_ALG1 swapped to keep that true) */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
58 
/* Forward declarations: symbolic/numeric factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solves and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Forward declarations: destruction of GPU-side structures.
   NOTE(review): MatSeqAIJCUSPARSEMultStruct_Destroy is declared twice with different
   parameter lists — a C++ overload; confirm this is intentional rather than a typo for
   a TriFactorStruct-specific destroyer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Forward declarations: host/device data movement helpers */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/* Type-specific implementation behind MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
   A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL are equivalent here; any other operation is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops set the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
112 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; a no-op otherwise */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
137 
/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve() for SEQAIJCUSPARSE:
   records whether MatSolve should run on the CPU instead of the GPU. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
146 
/*@
   MatCUSPARSESetUseCPUSolve - Sets whether MatSolve is performed on the CPU.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; a no-op otherwise */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
172 
/* MatSetOption implementation for SEQAIJCUSPARSE: intercepts
   MAT_FORM_EXPLICIT_TRANSPOSE (which has GPU-side state attached) and forwards
   every other option to the base SeqAIJ handler. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
188 
189 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
190 
/* Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself runs on
   the CPU via the SeqAIJ kernel; afterwards the appropriate GPU (or CPU) solve
   routines are installed and, unless CPU solves were requested, the triangular
   factors are analyzed and copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before factoring on the CPU */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* identity permutations allow the cheaper natural-ordering solve kernels */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));

  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  if (!cusp->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    /* stage the triangular factors on the GPU and run the solve analysis */
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}
224 
/* Processes -mat_cusparse_* command-line options for an unfactored SEQAIJCUSPARSE
   matrix: storage format for SpMV and/or all operations, the CPU-solve toggle,
   and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* format for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    /* format for all operations; processed after the MULT-only option, so it overrides it when both are given */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
264 
/* Builds (or refreshes) the GPU copy of the unit lower triangular ILU/LU factor L
   from the host factor stored in A.  On the first call the CSR structure is
   assembled in pinned host buffers (an explicit unit diagonal is appended to each
   row), uploaded to the device via thrust, and the cusparse csrsv solve analysis
   is performed.  On later calls (same sparsity pattern) only the numerical values
   are refreshed from the host factor. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (row 0 of L holds only the unit diagonal, hence ai[1]-ai[0] is not counted) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust uploads below are fast */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* strictly-lower entries of row i, then the appended unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) for the values-only refresh path below; the index buffers are freed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
408 
/* Builds (or refreshes) the GPU copy of the upper triangular ILU/LU factor U from
   the host factor stored in A.  The SeqAIJ factor is traversed via a->diag from
   the last row backwards; the reciprocal of each diagonal entry is stored (so the
   diagonal is NON_UNIT on the GPU and the solve can multiply rather than divide —
   note AAUp[offset] = 1./v[nz] below).  First call assembles structure + values
   and runs the csrsv solve analysis; later calls refresh values only. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the thrust uploads below are fast */
        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AAUp is kept (as AA_h) for the values-only refresh path below; the index buffers are freed */
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
551 
/* Stages the ILU/LU triangular factors of A on the GPU: builds L and U device
   matrices (with solve analysis), allocates the shared work vector, and caches
   the row/column permutations on the device when they are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the row permutation unless it is the identity (or already cached) */
  PetscCall(ISIdentity(isrow,&rowIdentity));
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *perm;

    PetscCall(ISGetIndices(isrow,&perm));
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(perm,perm+n);
    PetscCall(ISRestoreIndices(isrow,&perm));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* likewise for the column permutation */
  PetscCall(ISIdentity(iscol,&colIdentity));
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *perm;

    PetscCall(ISGetIndices(iscol,&perm));
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(perm,perm+n);
    PetscCall(ISRestoreIndices(iscol,&perm));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
594 
/*
  Build (or refresh) the GPU triangular factors for an ICC factorization.

  The host factor data in A is laid out SBAIJ-style (upper triangle, with the
  diagonal entry stored LAST in each row — see the v[nz] accesses below).  On
  first call this routine expands that data into two CSR matrices on the GPU:
    - upTriFactor: U with unit diagonal (solved non-transposed), and
    - loTriFactor: built from the same CSR pattern as U but solved TRANSPOSED
      with non-unit diagonal, so it acts as L = U^T scaled by the diagonal.
  It also runs the cusparse triangular-solve analysis for both.  On subsequent
  calls (factors already allocated) only the numerical values are recomputed
  and re-uploaded.  Nothing is done unless the host copy is newer than the GPU
  copy (offloadmask is UNALLOCATED or CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* the factor's arrays are accessed through the SBAIJ view of the same data */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the up/lo factor values */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; the stored diagonal is the last
             entry of the row (v[nz]), and the factor keeps its reciprocal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: U gets -v, the L values are additionally scaled by 1/diag */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix: copy the assembled CSR arrays to device vectors */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 (CUDA >= 9) requires an explicitly sized work buffer */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* note: fill mode stays UPPER because the lower factor is represented by
           the upper-triangular CSR pattern solved with a TRANSPOSE operation */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same sparsity pattern as the upper factor, lower values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Factors already exist: recompute only the numerical values.
           Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        /* upload the refreshed values; the sparsity pattern is unchanged */
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
811 
/*
  Push the ICC factors of A to the GPU and prepare everything the GPU solve
  needs: the work vector, the factored nonzero count, and (for non-identity
  orderings) the row permutation and its inverse staged in device arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  PetscBool                    identity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) { factors->workVector = new THRUSTARRAY(nrows); }
  /* both triangles are stored, but the diagonal only once */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* stage the ordering (and its inverse) on the device unless it is the identity */
  PetscCall(ISIdentity(rowperm,&identity));
  if (!identity) {
    IS             invperm;
    const PetscInt *inv_idx,*idx;

    PetscCall(ISInvertPermutation(rowperm,PETSC_DECIDE,&invperm));
    PetscCall(ISGetIndices(invperm,&inv_idx));
    PetscCall(ISGetIndices(rowperm,&idx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inv_idx, inv_idx+nrows);
    PetscCall(ISRestoreIndices(invperm,&inv_idx));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(rowperm,&idx));
    PetscCall(PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
848 
/*
  Numeric Cholesky factorization for SeqAIJCUSPARSE: the factorization itself
  runs on the host (via the SeqAIJ implementation), after which the solve
  callbacks are selected and the factors are copied to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *factor  = (Mat_SeqAIJ*)B->data;
  IS         rowperm  = factor->row;
  PetscBool  natural;

  PetscFunctionBegin;
  /* the host factorization needs an up-to-date host copy of A */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* a natural (identity) ordering allows the permutation-free solve kernels */
  PetscCall(ISIdentity(rowperm,&natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU and run the solve analysis */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
877 
/*
  Build the transposes of the lower and upper triangular factors (as CSC via
  csr2csc) and run the cusparse solve analysis on them, so that transpose
  solves can use NON_TRANSPOSE triangular kernels on the transposed data.
  The fill mode of each transposed descriptor is flipped relative to its
  source; index base and diag type are carried over.

  Fixes: the two MAT_CUSPARSEGenerateTranspose events were begun twice and
  never ended (PetscLogEventBegin called where PetscLogEventEnd belongs),
  which corrupts the PETSc event log; both are now properly ended.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     transposing flips the fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* csr2cscEx2 (CUDA >= 11) needs an explicitly sized work buffer */
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
                                  #else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                         loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                         loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                         loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                         &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                   loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                   loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                   loTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   loTriFactorT->solveInfo,
                                   loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
                                   #else
                                   loTriFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                               upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                               upTriFactor->csrMat->values->data().get(),
                                               upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(),
                                               upTriFactorT->csrMat->values->data().get(),
                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                  upTriFactor->csrMat->values->data().get(),
                                  upTriFactor->csrMat->row_offsets->data().get(),
                                  upTriFactor->csrMat->column_indices->data().get(),
                                  upTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
                                  #else
                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC, indexBase));
                                 #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                         upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                         upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                         upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                         &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                   upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                   upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                   upTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   upTriFactorT->solveInfo,
                                   upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
                                   #else
                                   upTriFactorT->solveInfo));
                                   #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1081 
/* Unary functor usable on host and device: converts a PetscScalar to a
   PetscInt by taking its real part and truncating toward zero. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar value)
  {
    return (PetscInt)PetscRealPart(value);
  }
};
1090 
1091 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1092 {
1093   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1094   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1095   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1096   cusparseStatus_t             stat;
1097   cusparseIndexBase_t          indexBase;
1098 
1099   PetscFunctionBegin;
1100   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1101   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1102   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1103   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1104   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1105   if (A->transupdated) PetscFunctionReturn(0);
1106   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1107   PetscCall(PetscLogGpuTimeBegin());
1108   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1109     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1110   }
1111   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1112     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1113     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1114     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1115     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1116     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1117 
1118     /* set alpha and beta */
1119     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
1120     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
1121     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1122     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1123     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
1124     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1125 
1126     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1127       CsrMatrix *matrixT = new CsrMatrix;
1128       matstructT->mat = matrixT;
1129       matrixT->num_rows = A->cmap->n;
1130       matrixT->num_cols = A->rmap->n;
1131       matrixT->num_entries = a->nz;
1132       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1133       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1134       matrixT->values = new THRUSTARRAY(a->nz);
1135 
1136       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1137       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1138 
1139      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1140       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1141         stat = cusparseCreateCsr(&matstructT->matDescr,
1142                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1143                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1144                                matrixT->values->data().get(),
1145                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1146                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1147       #else
1148         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1149            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1150 
1151            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1152            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1153            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1154         */
1155         if (matrixT->num_entries) {
1156           stat = cusparseCreateCsr(&matstructT->matDescr,
1157                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1158                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1159                                  matrixT->values->data().get(),
1160                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1161                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1162 
1163         } else {
1164           matstructT->matDescr = NULL;
1165           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1166         }
1167       #endif
1168      #endif
1169     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1170    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1171       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1172    #else
1173       CsrMatrix *temp  = new CsrMatrix;
1174       CsrMatrix *tempT = new CsrMatrix;
1175       /* First convert HYB to CSR */
1176       temp->num_rows = A->rmap->n;
1177       temp->num_cols = A->cmap->n;
1178       temp->num_entries = a->nz;
1179       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1180       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1181       temp->values = new THRUSTARRAY(a->nz);
1182 
1183       stat = cusparse_hyb2csr(cusparsestruct->handle,
1184                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1185                               temp->values->data().get(),
1186                               temp->row_offsets->data().get(),
1187                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1188 
1189       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1190       tempT->num_rows = A->rmap->n;
1191       tempT->num_cols = A->cmap->n;
1192       tempT->num_entries = a->nz;
1193       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1194       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1195       tempT->values = new THRUSTARRAY(a->nz);
1196 
1197       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1198                               temp->num_cols, temp->num_entries,
1199                               temp->values->data().get(),
1200                               temp->row_offsets->data().get(),
1201                               temp->column_indices->data().get(),
1202                               tempT->values->data().get(),
1203                               tempT->column_indices->data().get(),
1204                               tempT->row_offsets->data().get(),
1205                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1206 
1207       /* Last, convert CSC to HYB */
1208       cusparseHybMat_t hybMat;
1209       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1210       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1211         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1212       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1213                               matstructT->descr, tempT->values->data().get(),
1214                               tempT->row_offsets->data().get(),
1215                               tempT->column_indices->data().get(),
1216                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1217 
1218       /* assign the pointer */
1219       matstructT->mat = hybMat;
1220       A->transupdated = PETSC_TRUE;
1221       /* delete temporaries */
1222       if (tempT) {
1223         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1224         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1225         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1226         delete (CsrMatrix*) tempT;
1227       }
1228       if (temp) {
1229         if (temp->values) delete (THRUSTARRAY*) temp->values;
1230         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1231         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1232         delete (CsrMatrix*) temp;
1233       }
1234      #endif
1235     }
1236   }
1237   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1238     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1239     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1240     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1241     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1242     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1243     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1244     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1245     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1246     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1247     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1248     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1249       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1250       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1251       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1252     }
1253     if (!cusparsestruct->csr2csc_i) {
1254       THRUSTARRAY csr2csc_a(matrix->num_entries);
1255       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1256 
1257       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1258      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259       void   *csr2cscBuffer;
1260       size_t csr2cscBufferSize;
1261       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1262                                            A->cmap->n, matrix->num_entries,
1263                                            matrix->values->data().get(),
1264                                            cusparsestruct->rowoffsets_gpu->data().get(),
1265                                            matrix->column_indices->data().get(),
1266                                            matrixT->values->data().get(),
1267                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1268                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1269                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
1270       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1271      #endif
1272 
1273       if (matrix->num_entries) {
1274         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1275            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1276            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1277 
1278            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1279            should be filled with indexBase. So I just take a shortcut here.
1280         */
1281         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1282                               A->cmap->n,matrix->num_entries,
1283                               csr2csc_a.data().get(),
1284                               cusparsestruct->rowoffsets_gpu->data().get(),
1285                               matrix->column_indices->data().get(),
1286                               matrixT->values->data().get(),
1287                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1288                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1289                               CUSPARSE_ACTION_NUMERIC,indexBase,
1290                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1291                              #else
1292                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1293                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1294                              #endif
1295       } else {
1296         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1297       }
1298 
1299       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1300       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1301      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1302       PetscCallCUDA(cudaFree(csr2cscBuffer));
1303      #endif
1304     }
1305     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1306                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1307                                                      matrixT->values->begin()));
1308   }
1309   PetscCall(PetscLogGpuTimeEnd());
1310   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1311   /* the compressed row indices is not used for matTranspose */
1312   matstructT->cprowIndices = NULL;
1313   /* assign the pointer */
1314   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1315   A->transupdated = PETSC_TRUE;
1316   PetscFunctionReturn(0);
1317 }
1318 
1319 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the transposed triangular factors of the LU factorization,
   honoring the row/column permutations stored with the factors.  The data flow is:
     x    <- b permuted by rpermIndices
     temp <- solve with the transposed upper factor applied to x
     x    <- solve with the transposed lower factor applied to temp
     temp <- x permuted by cpermIndices
     x    <- temp
   The transposed factor structures are built lazily on the first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;   /* local length of the solution vector */
  const PetscScalar                     *barray;          /* raw device pointer to b (read-only) */
  PetscScalar                           *xarray;          /* raw device pointer to x (write-only) */
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factor structures lazily, on the first transpose solve */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Solve with the transposed upper factor: input xarray, output tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Solve with the transposed lower factor: input tempGPU, output xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation
     (temp[i] = x[cperm[i]]) ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1406 
/* Solve A^T x = b with the transposed triangular factors when the factorization
   used the natural ordering, i.e. no row/column permutations are applied.
   The transposed factor structures are created lazily on the first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and build the transposed factors on the fly if absent */
  if (!lowerT && !upperT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Raw device pointers for the right-hand side and the solution */
  PetscCall(VecCUDAGetArrayWrite(xx,&x));
  PetscCall(VecCUDAGetArrayRead(bb,&b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Stage 1: solve with the transposed upper factor (input b, output work) */
  cerr = cusparseXcsrsv_solve(factors->handle, upperT->solveOp,
                        upperT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upperT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        b,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        work->data().get(),
                        upperT->solvePolicy, upperT->solveBuffer);PetscCallCUSPARSE(cerr);
                      #else
                        work->data().get());PetscCallCUSPARSE(cerr);
                      #endif

  /* Stage 2: solve with the transposed lower factor (input work, output x) */
  cerr = cusparseXcsrsv_solve(factors->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lowerT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        x,
                        lowerT->solvePolicy, lowerT->solveBuffer);PetscCallCUSPARSE(cerr);
                      #else
                        x);PetscCallCUSPARSE(cerr);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&b));
  PetscCall(VecCUDARestoreArrayWrite(xx,&x));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1475 
/* Solve A x = b using the LU triangular factors, honoring the row/column
   permutations stored with the factorization.  The data flow is:
     temp <- b permuted by rpermIndices
     x    <- lower-factor solve applied to temp
     temp <- upper-factor solve applied to x
     x    <- temp permuted by cpermIndices */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;          /* raw device pointer to b (read-only) */
  PetscScalar                           *xarray;          /* raw device pointer to x (write-only) */
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: temp[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve with the lower factor L: input tempGPU, output xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve with the upper factor U: input xarray, output tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: x[i] = temp[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1550 
/* Solve A x = b with the LU triangular factors when the factorization used the
   natural ordering, so no row/column permutations need to be applied. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  cerr;

  PetscFunctionBegin;
  /* Raw device pointers for the right-hand side and the solution */
  PetscCall(VecCUDAGetArrayWrite(xx,&x));
  PetscCall(VecCUDAGetArrayRead(bb,&b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Stage 1: solve with the lower factor L (input b, output work) */
  cerr = cusparseXcsrsv_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        b,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        work->data().get(),
                        lower->solvePolicy,lower->solveBuffer);PetscCallCUSPARSE(cerr);
                      #else
                        work->data().get());PetscCallCUSPARSE(cerr);
                      #endif

  /* Stage 2: solve with the upper factor U (input work, output x) */
  cerr = cusparseXcsrsv_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        x,
                        upper->solvePolicy, upper->solveBuffer);PetscCallCUSPARSE(cerr);
                      #else
                        x);PetscCallCUSPARSE(cerr);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&b));
  PetscCall(VecCUDARestoreArrayWrite(xx,&x));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: 2*nnz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1611 
1612 #if CUSPARSE_VERSION >= 11500
1613 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/* Solve with an ILU(0) factorization using the generic cusparseSpSV API
   (cusparse >= 11.3): first L y = b into the internal work vector fs->Y, then
   U x = y into the caller's x.  The dense-vector descriptors dnVecDescr_X/Y are
   reused across calls; only their data pointers are rebound here. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors  *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar             *barray;   /* raw device pointer to b (read-only) */
  PetscScalar                   *xarray;   /* raw device pointer to x (write-only) */

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b: bind X to b's data and Y to the internal work vector */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L, /* L Y = X */
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L));

  /* Solve U*x = y: rebind X to x's data so the result lands in the output vector */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U, /* U X = Y */
                    fs->dnVecDescr_Y,
                    fs->dnVecDescr_X,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: 2*nz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1658 
/* Transpose solve with an ILU(0) factorization via cusparseSpSV:
   U^T y = b into the work vector fs->Y, then L^T x = y.  The transpose solve
   descriptors (spsvDescr_Lt/Ut), their buffers, and the SpSV analyses are set up
   lazily on the first call and cached via the builtSolveTranspose flag. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors  *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar             *barray;   /* raw device pointer to b (read-only) */
  PetscScalar                   *xarray;   /* raw device pointer to x (write-only) */

  PetscFunctionBegin;
  if (!fs->builtSolveTranspose) { /* Call MatSolveTranspose() for the first time */
    /* One-time setup: create descriptors, size and allocate buffers, and run the
       SpSV analysis phase for both transposed factors */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                      CUSPARSE_OPERATION_TRANSPOSE,
                      &PETSC_CUSPARSE_ONE,
                      fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                      fs->dnVecDescr_X,
                      fs->dnVecDescr_Y,
                      cusparse_scalartype,
                      CUSPARSE_SPSV_ALG_DEFAULT,
                      fs->spsvDescr_Lt,
                      &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                      CUSPARSE_OPERATION_TRANSPOSE,
                      &PETSC_CUSPARSE_ONE,
                      fs->spMatDescr_U,
                      fs->dnVecDescr_X,
                      fs->dnVecDescr_Y,
                      cusparse_scalartype,
                      CUSPARSE_SPSV_ALG_DEFAULT,
                      fs->spsvDescr_Ut,
                      &fs->spsvBufferSize_Ut));

    /* NOTE(review): these buffers are allocated here but not freed in this routine;
       presumably released by the factors' destroy path — confirm */
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Lt,
                    fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Ut,
                    fs->spsvBuffer_Ut));
    fs->builtSolveTranspose = PETSC_TRUE;  /* setup done; skip on subsequent calls */
  }

  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b: bind X to b's data and Y to the internal work vector */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U, /* Ut Y = X */
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Ut));

  /* Solve Lt*x = y: rebind X to x's data so the result lands in the output vector */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L, /* Lt X = Y */
                    fs->dnVecDescr_Y,
                    fs->dnVecDescr_X,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: 2*nz - n flops */
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1754 
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - Numeric ILU(0) factorization on the GPU via cusparseXcsrilu02()

  fact shares A's sparsity pattern (set up by MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0()); A's values are
  copied into fact's device value array and factored in place. The SpSV analyses for the L and U triangular
  solves are done afterwards, since they need valid numerical values. On success the ILU0 solve routines
  are installed on fact and the flop count estimated in the symbolic phase is logged.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Check whether the factorization hit a numerically zero pivot (returned 1-based by cusparse) */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  /* From my experiment, cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_U,
                    fs->spsvBuffer_U));

  fact->offloadmask            = PETSC_OFFLOAD_GPU; /* the factored values live only on the GPU */
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed in the symbolic phase */
  PetscFunctionReturn(0);
}
1824 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - Symbolic ILU(0) factorization with the cusparse generic API

  ILU(0) introduces no fill, so fact simply adopts A's sparsity pattern. This routine allocates fact's
  device CSR arrays, creates the cusparse descriptors for M (the in-place ILU storage) and its triangular
  views L (unit diagonal) and U (non-unit diagonal), queries and allocates all work buffers, runs the
  csrilu02 structural analysis, and estimates the flops of the numeric phase, which is carried out by
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0().
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L and U are triangular views aliasing M's device arrays; L has the implicit unit diagonal of ILU */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_U,
                    &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     spsvBuffer_L and spsvBuffer_U can not be shared.
   */
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U));
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M));

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *Ai,*Adiag,nzRow,nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i=0; i<m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i+1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* Eliminating each nonzero left to the diagonal updates the nonzeros to its right (itself
           included), each update costing one multiplication and one addition. Use the exact left-of-
           diagonal count; only the symmetric IC(0) path, where Adiag is unavailable, falls back to
           the (nzRow-1)/2 average.
        */
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}
2026 
/*
  MatSolve_SeqAIJCUSPARSE_ICC0 - Solve A x = b with the IC(0) factor A = L*Lt on the GPU

  Performs a forward solve with L (result in the work vector fs->Y) followed by a backward
  solve with Lt, using the SpSV descriptors prepared by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0().
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors  *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b, with y kept in the work vector fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X,fs->dnVecDescr_Y,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_L));

  /* Backward solve Lt x = y; rebind descriptor X to the output array of x */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y,fs->dnVecDescr_X,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
2071 
/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - Numeric IC(0) (incomplete Cholesky) factorization on the GPU via cusparseXcsric02()

  fact shares A's sparsity pattern (set up by MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0()); A's values are
  copied into fact's device value array and factored in place as A = L*Lt. The SpSV analyses for the solves
  with L and Lt are done afterwards, since they need valid numerical values.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, /* m==0 is skipped, as in the csrilu02 path */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ic0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Check whether the factorization hit a numerically zero pivot */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Lt,
                    fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU; /* the factored values live only on the GPU */
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* the factorization is symmetric (L*Lt), so transpose solve equals solve */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed in the symbolic phase */
  PetscFunctionReturn(0);
}
2149 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - Symbolic IC(0) (incomplete Cholesky) factorization with the cusparse generic API

  IC(0) introduces no fill, so fact simply adopts A's sparsity pattern. This routine allocates fact's
  device CSR arrays, creates the cusparse descriptors for M (the in-place IC storage) and its lower
  triangular view L, queries and allocates all work buffers, runs the csric02 structural analysis, and
  estimates the flops of the numeric phase, which is carried out by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0().
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L is a lower triangular view (non-unit diagonal, unlike ILU) aliasing M's device arrays */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz,
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ic0Info_M,
                    &fs->factBufferSize_M));

  /* Work vectors for the two-stage triangular solve */
  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Lt,
                    &fs->spsvBufferSize_Lt));

  PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M));
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz,
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ic0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *Ai,nzRow,nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i=0; i<m; i++) {
      nzRow = Ai[i+1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate the nonzeros left to the diagonal one by one. Assume eliminating one
          updates the nonzeros to its right (itself included), each update costing one multiplication
          and one addition. (nzRow-1)/2 is taken as the number of nonzeros left to the diagonal,
          presumably an average for a symmetric sparsity pattern.
        */
        nzLeft = (nzRow-1)/2;
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
2326 #endif
2327 
/*
  Symbolic ILU factorization dispatcher: when ILU(0) with natural ordering is requested and cusparse is
  new enough to provide the generic SpSV API, take the specialized GPU path; otherwise do the symbolic
  phase on the host with the SeqAIJ routine and redirect only the numeric phase to the GPU.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
 #if CUSPARSE_VERSION >= 11500
  PetscBool row_identity,col_identity;

  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!info->levels && row_identity && col_identity) { /* ILU(0) with natural ordering */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info));
    PetscFunctionReturn(0);
  }
 #endif
  /* Fallback: host symbolic factorization, GPU numeric factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2348 
/*
  Symbolic LU factorization for SEQAIJCUSPARSE: the symbolic phase is always done on the host by the
  SeqAIJ routine; only the numeric phase is redirected to the GPU implementation.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* Drop any stale GPU factor data before re-running the symbolic phase */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2359 
/*
  Symbolic ICC factorization dispatcher: uses cuSPARSE's native IC(0) when possible,
  otherwise performs the CPU symbolic phase and leaves the numeric phase to the GPU.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
 #if CUSPARSE_VERSION >= 11500
  /* Fast path: zero fill levels with the natural ordering maps directly onto cuSPARSE IC(0) */
  PetscBool permIdentity;
  PetscCall(ISIdentity(perm,&permIdentity));
  if (!info->levels && permIdentity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info));
    PetscFunctionReturn(0);
  }
 #endif
  /* General path: drop stale triangular factors, run the CPU symbolic ICC(k),
     and register the cuSPARSE numeric Cholesky factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2379 
/*
  Symbolic Cholesky factorization: reuses the CPU symbolic phase and registers
  the cuSPARSE-based numeric factorization routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* Discard any triangular factors left over from a previous factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  /* Numeric phase runs through cuSPARSE */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2390 
/* Reports the solver package name (MATSOLVERCUSPARSE) for factor matrices created by this package.
   Composed on factor matrices as the "MatFactorGetSolverType_C" method in MatGetFactor_seqaijcusparse_cusparse(). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2397 
2398 /*MC
2399   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2400   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2403   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2404   algorithms are not recommended. This class does NOT support direct solver operations.
2405 
2406   Level: beginner
2407 
2408 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2409 M*/
2410 
/*
  Creates an (empty) factor matrix of type MATSEQAIJCUSPARSE for A and installs the
  symbolic factorization routines appropriate to the requested factor type. GPU symbolic
  routines are used unless A is bound to the CPU, in which case the plain SeqAIJ ones are set.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt n = A->rmap->n;
  Mat      F;

  PetscFunctionBegin;
  /* Factor matrices are square with the same row size as A */
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  F = *B;
  PetscCall(MatSetSizes(F,n,n,n,n));
  F->factortype = ftype;
  PetscCall(MatSetType(F,MATSEQAIJCUSPARSE));

  /* Propagate CPU binding from A if requested */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(F,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(F,A,A));
    if (!A->boundtocpu) {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Nested dissection for full LU; natural ordering for the incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL));
  F->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2451 
/*
  Synchronizes the host copy of A's numerical values with the device copy when the
  device is ahead (offloadmask == PETSC_OFFLOAD_GPU). Only values are copied, not the
  sparsity pattern. For unfactored matrices values come from the cuSPARSE CSR storage;
  for factored matrices (newer cuSPARSE path) from the combined factor array fs->csrVal.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
 #if CUSPARSE_VERSION >= 13500
  /* For factored matrices spptr holds the triangular-factors struct instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
 #endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* Unfactored: copy the CSR value array device -> host */
      CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
   #if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
   #endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    /* Host and device now agree */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2480 
/* Read/write access to the host value array; syncs values from the device first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Make sure the host copy of the values is current before handing it out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2488 
/* Ends read/write access obtained with MatSeqAIJGetArray_SeqAIJCUSPARSE() */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Host values may have been modified, so the device copy is now stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
2496 
/* Read-only access to the host value array; syncs values from the device first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Bring the host copy up to date before exposing it */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2504 
/* Ends read-only access: the device copy stays valid, so only the pointer is dropped
   (no offloadmask change, unlike the read/write restore) */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
2511 
/* Write-only access to the host value array; no device-to-host sync is needed
   since the current values will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
  PetscFunctionReturn(0);
}
2518 
/* Ends write access obtained with MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE() */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Host values were (potentially) written, so the device copy is now stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
2526 
/*
  Returns device pointers to A's CSR arrays (row offsets, column indices, values) and
  reports PETSC_MEMTYPE_CUDA. Any of i/j/a/mtype may be NULL if not wanted.
  Not supported for factored matrices, nor for row/column indices when PETSc is built
  with 64-bit indices (cuSPARSE storage here uses 32-bit THRUSTINTARRAY32).

  Fix: the SETERRQ message read "does not supported"; corrected to "does not support".
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  /* Make sure the device copy is current before exposing its pointers */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    /* Device arrays are 32-bit ints; handing them out as 64-bit PetscInt would be wrong */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
2557 
/*
  Copies A's CSR data from the host (Mat_SeqAIJ) to the device, (re)building the
  cuSPARSE structures when the nonzero pattern changed since the last copy.
  No-op when the device copy is already current. Errors if A is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* Same pattern: refresh only the value array on the device */
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* The cached transpose (if any) now has stale values; keep its structure */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      /* Pattern changed (or non-CSR format): tear down and rebuild the device structures */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Compressed-row storage skips empty rows; ridx maps compressed rows to actual rows */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* No host value array: allocate device storage but do not mark host/device in sync */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants (cuSPARSE is put in device pointer mode below) */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* Build a temporary CSR matrix, convert it to HYB/ELL, then discard the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    /* both is false when no host value array existed; then the device copy is not authoritative */
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2712 
/* Thrust functor: accumulate the first tuple element into the second (y += x) */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
2722 
/* Thrust functor: copy the first tuple element into the second (y = x) */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2732 
/* Thrust functor: copy in the reverse direction, second tuple element into the first (x = y) */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2742 
/* Per-product scratch data attached to Mat_Product for cuSPARSE sparse-dense and
   sparse-sparse matrix products; freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;      /* result C started as MATSEQDENSE (CPU); convert it back after the GPU product */
  PetscScalar           *Bt;           /* device buffer for B^T, used by the pre-CUDA-11 csrmm path (no transpose-B support) */
  Mat                   X;             /* intermediate dense result for MATPRODUCT_RARt / MATPRODUCT_PtAP */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* extra device work buffers; NOTE(review): only freed in this chunk, presumably
     filled by the CUDA >= 11.4 SpGEMM API elsewhere — confirm against the SpGEMM code */
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;  /* current size of mmBuffer, to allow reuse/growth */
  void                  *mmBuffer;     /* SpMM/SpGEMM work buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2767 
/* Destructor for MatMatCusparse product data: releases device buffers, cuSPARSE
   descriptors, the intermediate matrix X, and the struct itself */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  /* cudaFree accepts NULL; descriptor destroys are guarded explicitly */
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)  PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5)  PetscCallCUDA(cudaFree(mm->dBuffer5));
 #endif
  if (mm->mmBuffer)  PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2791 
2792 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2793 
/*
  Numeric phase of C = op(A)*op(B) where A is seqaijcusparse and B dense: performs the
  sparse-dense multiply on the GPU with cusparseSpMM (CUDA >= 11) or csrmm (older CUDA).
  For RARt/PtAP the A*op(B) part lands in the intermediate mmdata->X and is then
  multiplied by B (or B^T) with a dense-dense product to form C.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* Select which device matrix to use, the transpose flag, and the result dimensions m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* Either ask cuSPARSE to transpose on the fly, or use a stored explicit transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  /* RARt/PtAP write the sparse-dense product into the intermediate X, not directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* A change in the leading dimension invalidates the cached dense descriptors */
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    /* Query the required work-buffer size and grow our buffer only when needed */
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    /* Explicitly transpose B into the preallocated mmdata->Bt via cublasXgeam */
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
 #endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* For RARt/PtAP, finish with the dense-dense product against B (transposed for PtAP) */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* Convert back any matrices the user originally supplied as CPU dense */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2964 
/*
  Symbolic phase for sparse(A, seqaijcusparse) times dense(B) products: sizes and types C,
  allocates the MatMatCusparse scratch data (B^T buffer for the pre-CUDA-11 path,
  intermediate X for RARt/PtAP) and installs the numeric routine.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* Result dimensions m x n depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
3037 
/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse
   product C = A*B (or A^t*B / A*B^t via explicitly stored transposes) for two
   MATSEQAIJCUSPARSE matrices, computed on the GPU with cuSPARSE SpGEMM.

   Preconditions (established by MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE):
   - C->product->data holds a MatMatCusparse with the SpGEMM descriptor/buffers;
   - the CSR structure (row offsets, column indices) of C is already allocated on the GPU.

   This routine only (re)computes the numerical values of C; on exit C is flagged as
   assembled and its offloadmask is set to PETSC_OFFLOAD_GPU (unless C has no nonzeros).
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    /* Values were already computed during the symbolic phase; just sanity-check the
       cached C structures and jump straight to the assembly bookkeeping. */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute, but C still needs the assembly bookkeeping below */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* Since cuSPARSE SpGEMM does not support transposed operands (opA/opB above), the
     transpose cases are mapped to plain AB using explicitly stored transposes; for
     symmetric operands the symbolic phase already downgraded AtB/ABt to AB. */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose; /* explicit transpose built during symbolic phase */
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose; /* explicit transpose built during symbolic phase */
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero live on the device, hence the device pointer mode */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* SpGEMMreuse writes directly into C's arrays; no separate copy step is needed */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  #else
    /* SpGEMM recomputes into internal storage (using the buffer saved from the symbolic
       phase) and then copies the values into C's arrays */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 legacy csrgemm interface: numeric values computed in one call */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
3165 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse-sparse
   product C = A*B (also A^t*B and A*B^t, mapped to AB via explicit transposes) for two
   MATSEQAIJCUSPARSE matrices.

   Responsibilities:
   - allocate the MatMatCusparse product data attached to C->product;
   - build C's cuSPARSE CSR structure (row offsets / column indices) on the GPU using
     one of three version-dependent code paths:
       CUDA >= 11.4 : cusparseSpGEMMreuse_* (structure reusable across numeric calls),
       CUDA 11.0-3  : cusparseSpGEMM_*      (keeps work buffers for the numeric phase),
       CUDA < 11.0  : cusparseXcsrgemmNnz + cusparse_csr_spgemm;
   - mirror the structure back to the host AIJ representation (c->i, c->j, ilen/imax),
     expanding compressed-row offsets when A (and hence C) is stored compressed;
   - precompute the flop count logged by the numeric phase.

   Note: the >= 11.0 paths also compute numerical values as a side effect; when both
   operands are up to date on the GPU and the user called via the MatMatMult-style API,
   mmdata->reusesym is set so the numeric phase can skip recomputation once.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* For symmetric operands, AtB/ABt coincide with AB, so avoid building an explicit
     transpose; record the shortcut so the numeric phase can validate it. */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* Pick operand mult structs and result dimensions (m x n, inner dim k).
     Transpose cases use explicit transposes since opA/opB are NON_TRANSPOSE. */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct; /* ownership transferred to Ccusp->mat below */
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits A's nonzero row pattern: copy the row index map to the host AIJ
       struct and mirror it on the GPU for the mult kernels */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m; /* rows actually stored on the GPU */
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* degenerate product: build an all-zero CSR for C and skip the SpGEMM entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* Build a shallow CSR view of B with uncompressed row offsets; column indices and
       values are shared with B's stored CSR, only the offsets array is new. */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* mmdata owns this view; the numeric phase will pick it up */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each entry a(i,j) multiplies row j of B: 2*nnz(B_j) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A^t pairs with row i of B: 2*nnz(A_i)*nnz(B_i) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* create C's descriptor with nnz 0 and NULL arrays; real pointers are set after the
     nnz of the product is known (cusparseCsrSetPointers below) */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* two-pass idiom: first query sizes with NULL buffers, then call again with them allocated */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
  PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
  /* buffers 1 and 2 are no longer needed; buffer 3 is still needed by SpGEMMreuse_copy */
  PetscCallCUDA(cudaFree(dBuffer1));
  PetscCallCUDA(cudaFree(dBuffer2));

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer3));
  /* fill in the numerical values as well, so an immediately following numeric phase may reuse them */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  /* copy structure (and values) from the SpGEMM descriptor's internal storage into C */
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* cnz is a host int, so the nnz query needs host pointer mode */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* Mirror the GPU CSR structure into the host AIJ representation (c->i, c->j) so the
     CPU code paths see a consistent matrix; values stay GPU-only for now. */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i; /* GPU stores only the nonzero rows */
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i; /* GPU stores only the nonzero rows */
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* turn the compressed offsets (one per nonzero row) into full offsets (one per row):
       rows with no entries repeat the previous offset */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row-related statistics derived from the expanded offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a)); /* host values array, allocated but not filled here */
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3562 
3563 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3564 
/* handles sparse or dense B */
/* Selects the product implementation for mat = op(A) op(B) [op(C)]: use the CUSPARSE backend
   when all sparse operands are SeqAIJCUSPARSE and not bound to the CPU, honor the command line
   options that force the CPU backend, and otherwise fall back to the SeqAIJ (host) code. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* B only counts as GPU-capable if neither A nor B has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* the option name depends on whether the user entered through the old-style API
       (e.g. MatMatMult()) or through the MatProduct API; each product type has its own flag */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user forced the CPU backend */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no specialized triple-product kernel: compose the binary products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
3686 
/* yy = A xx; thin wrapper over the shared mult/multadd kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3693 
/* zz = A xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3700 
/* yy = A^H xx (conjugate transpose: trans=PETSC_TRUE, herm=PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3707 
/* zz = A^H xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3714 
/* yy = A^T xx (plain transpose: trans=PETSC_TRUE, herm=PETSC_FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3721 
/* y[idx[i]] += x[i] for i in [0,n): scatter-add the compressed work vector x into the
   full-length output y. Launched with a 1-D grid of ceil(n/256) blocks of 256 threads
   (see MatMultAddKernel_SeqAIJCUSPARSE); the i < n guard handles the partial tail block. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* Compute the global index in PetscInt, not int: when PETSc is configured with 64-bit
     indices, n may exceed the range of int and the blockIdx*blockDim product could
     overflow 32-bit arithmetic; the cast forces the multiply to happen in PetscInt. */
  PetscInt i = blockIdx.x*(PetscInt)blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3727 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared worker for all Mult/MultAdd/MultTranspose variants. Handles the compressed-row
   (zero rows dropped) storage by staging through cusparsestruct->workVector, and caches
   cuSPARSE dense-vector descriptors/buffers per operation in matstruct->cuSpMV[]. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: z = y (or 0), no GPU work needed */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Hermitian products always use cuSPARSE's op on the untransposed matrix; plain
       transpose uses the cached explicit transpose when the user enabled it */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the cuSpMV[] descriptor cache, so it must stay within [0,2] */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (multiply+add) per stored nonzero; without an add, one add per nonempty row is saved */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3916 
/* zz = A^T xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3923 
/* Completes host-side assembly, then drops the cached flat device-side matrix
   representation when assembly changed the nonzero structure. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp        = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   statebefore  = A->nonzerostate; /* snapshot before CPU assembly runs */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (cusp->deviceMat && statebefore != A->nonzerostate) {
    /* the nonzero pattern changed, so the cached device matrix is stale */
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3939 
3940 /* --------------------------------------------------------------------------------*/
3941 /*@
3942    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3944    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3945    assembly performance the user should preallocate the matrix storage by setting
3946    the parameter nz (or the array nnz).  By setting these parameters accurately,
3947    performance during matrix assembly can be increased by more than a factor of 50.
3948 
3949    Collective
3950 
3951    Input Parameters:
3952 +  comm - MPI communicator, set to PETSC_COMM_SELF
3953 .  m - number of rows
3954 .  n - number of columns
3955 .  nz - number of nonzeros per row (same for all rows)
3956 -  nnz - array containing the number of nonzeros in the various rows
3957          (possibly different for each row) or NULL
3958 
3959    Output Parameter:
3960 .  A - the matrix
3961 
3962    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3964    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3965 
3966    Notes:
3967    If nnz is given then nz is ignored
3968 
3969    The AIJ format (also called the Yale sparse matrix format or
3970    compressed row storage), is fully compatible with standard Fortran 77
3971    storage.  That is, the stored row and column indices can begin at
3972    either one (as in Fortran) or zero.  See the users' manual for details.
3973 
3974    Specify the preallocated storage with either nz or nnz (not both).
3975    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3976    allocation.  For large problems you MUST preallocate memory or you
3977    will get TERRIBLE performance, see the users' manual chapter on matrices.
3978 
3979    By default, this format uses inodes (identical nodes) when possible, to
3980    improve numerical efficiency of matrix-vector products and solves. We
3981    search for consecutive rows with the same nonzero structure, thereby
3982    reusing matrix information to achieve increased efficiency.
3983 
3984    Level: intermediate
3985 
3986 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3987 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  /* nnz is not modified by the preallocation routine; the cast only discards const */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
3997 
/* Free the GPU-side data (plain matrix or triangular factors, depending on factortype),
   remove the methods composed on the object, then destroy the underlying SeqAIJ matrix. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  /* composing NULL removes each previously composed method from the object */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
4019 
4020 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
4021 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a SeqAIJ copy on the host, then converting it in place to SeqAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
4029 
/* Y = Y + a*X. Uses a single cuBLAS axpy when the nonzero patterns match, cuSPARSE
   csrgeam when X's pattern is a subset of Y's, and the SeqAIJ (host) code otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed types (e.g. one operand bound to CPU): do it on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* compare row offsets first (cheaper), then column indices, both on the device */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* scalars a,b live on the host here; restore device pointer mode afterwards */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the values arrays line up entry-for-entry, so a flat axpy works */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
4123 
/* Y = a*Y: scale every stored nonzero of Y on the GPU with one cuBLAS scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  cublasHandle_t handle;
  PetscScalar    *yvals;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz,&n)); /* number of stored nonzeros, as a BLAS int */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&yvals)); /* up-to-date device values, marked for write */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle,n,&a,yvals,stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&yvals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* cached diagonal info is now stale */
  PetscFunctionReturn(0);
}
4143 
/* Set all stored entries of A to zero, on the host and (when present) on the device */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij           = (Mat_SeqAIJ*)A->data;
  PetscBool  zeroedOnDevice = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (cusp->mat) { /* zero the device CSR values, if they exist */
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        zeroedOnDevice = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) { /* keep the cached explicit transpose consistent */
      CsrMatrix *csr = (CsrMatrix*)cusp->matTranspose->mat;
      if (csr->values) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n])); /* zero the host values */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* if the device copy was zeroed too, host and device agree; otherwise only the host is valid */
  A->offloadmask = zeroedOnDevice ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
4172 
/*
  Switch A's operation tables between the CPU (flg = PETSC_TRUE) and GPU (flg = PETSC_FALSE)
  implementations.

  Factored matrices have no alternate kernels to install, so only the flag is recorded.
  When binding to the CPU the current GPU values are first copied back to the host so the
  CPU kernels see up-to-date data, and all CUSPARSE-specific composed functions are removed;
  when unbinding, the CUSPARSE implementations and composed functions are (re)installed.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* nothing to swap for factored matrices; just remember the binding */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* bind to CPU: make sure host arrays are current before CPU kernels run */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the inner SeqAIJ op table so the SeqAIJ defaults take over */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    /* drop the CUSPARSE-specific composed functions */
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    /* bind to GPU: install the CUSPARSE implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
   }
  A->boundtocpu = flg;
  /* inodes are only usable by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4236 
/*
  Convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE.

  Handles MAT_INITIAL_MATRIX (duplicate A), MAT_REUSE_MATRIX (copy into *newmat), and
  MAT_INPLACE_MATRIX (retype A itself). Allocates the CUSPARSE side container (spptr),
  installs the CUSPARSE operation tables, and composes the type-specific functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX, *newmat is A itself */

  /* vectors created from this matrix should default to CUDA vectors */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: allocate the SpMV/SpMM container and pick default cusparse algorithms */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factor container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* nothing on the device yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); /* install the GPU operation table */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
4296 
/* Constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
4304 
4305 /*MC
4306    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4307 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
4309    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
4310    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
4311 
4312    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
4317 
4318   Level: beginner
4319 
4320 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4321 M*/
4322 
4323 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
4324 
/* Register the CUSPARSE-based direct solver backends with PETSc's solver-type registry */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  /* the banded LU solver takes plain host SEQAIJ input */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  /* the generic cusparse solver supports LU, Cholesky, ILU and ICC on SEQAIJCUSPARSE */
  for (size_t i = 0; i < sizeof(factors)/sizeof(factors[0]); i++) {
    PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[i],MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(0);
}
4336 
/*
  Discard all COO-assembly state held on the CUSPARSE side of mat: the thrust permutation
  arrays of the basic path and the jmap/perm device maps of the extended path.

  Fix: jmap_d/perm_d are reset to NULL after cudaFree. MatSeqAIJCUSPARSE_Destroy() frees
  these pointers again whenever they are non-NULL, so leaving them dangling here could
  lead to a double cudaFree if a reset is followed by a destroy.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* prevent a later destroy from freeing these again */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4354 
/* Free a Mat_SeqAIJCUSPARSE container: the mult structs for A and its cached transpose,
   the thrust work/COO arrays, the cusparse handle, the extended-COO device maps, and
   finally the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;    /* deleting NULL is a no-op for all of these */
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
4373 
/* Free the three device arrays of a CsrMatrix and the struct itself; *mat is set to NULL */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
4386 
/* Free one triangular-factor struct: the cusparse matrix descriptor, the csrsv solve
   info, the CSR data, the solve and csr2csc scratch buffers, and the pinned host
   copy of the values. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host memory */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
4403 
/*
  Free a Mat_SeqAIJCUSPARSEMultStruct (the SpMV/SpMM data for A or its transpose) stored
  in the given storage format: the CSR (or legacy HYB) matrix data, the cusparse
  descriptors, the compressed-row index array, the device-resident scalar constants,
  and (CUDA >= 11) any cached SpMV descriptors/buffers.

  Fix: the CsrMatrix_Destroy() return code was previously discarded; it is now checked
  with PetscCall() like every other call in this file.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was unchecked before; propagate errors */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) { /* free each initialized cached SpMV configuration */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
4445 
/*
  Release all factorization data held by a Mat_SeqAIJCUSPARSETriFactors container while
  keeping the container (and its cusparse handle) alive, so it can be reused for a new
  factorization. Also called from MatSeqAIJCUSPARSETriFactors_Destroy().
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* the four triangular factors: lower/upper and their transposes */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices; /* deleting NULL is harmless */
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector = NULL;
    if (fs->a_band_d)   PetscCallCUDA(cudaFree(fs->a_band_d)); /* banded-solver device data */
    if (fs->i_band_d)   PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
   #if CUSPARSE_VERSION >= 11500
    /* data for the cusparse generic (SpSV) factorization/solve path */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    PetscCallCUDA(cudaFree(fs->factBuffer_M));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->builtSolveTranspose = PETSC_FALSE;
   #endif
  }
  PetscFunctionReturn(0);
}
4492 
/* Free everything held by the triangular-factor container, then the container itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); /* frees the factor data */
    handle = (*trifactors)->handle;                           /* Reset() keeps the handle */
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
4507 
/* Strict-weak ordering for (row,col) tuples: sort by row first, break ties by column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
4518 
/* Equality of (row,col) tuples: both components must match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
4528 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
4537 
/* Logical OR of two flags, returned as a 0/1 PetscInt */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0) || (t2 != 0);
  }
};
4546 
4547 #include <thrust/iterator/discard_iterator.h>
4548 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
4549 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
4550 {
4551   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4552   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
4553   THRUSTARRAY                           *cooPerm_v = NULL;
4554   thrust::device_ptr<const PetscScalar> d_v;
4555   CsrMatrix                             *matrix;
4556   PetscInt                              n;
4557 
4558   PetscFunctionBegin;
4559   PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
4560   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
4561   if (!cusp->cooPerm) {
4562     PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
4563     PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
4564     PetscFunctionReturn(0);
4565   }
4566   matrix = (CsrMatrix*)cusp->mat->mat;
4567   PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4568   if (!v) {
4569     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
4570     goto finalize;
4571   }
4572   n = cusp->cooPerm->size();
4573   if (isCudaMem(v)) {
4574     d_v = thrust::device_pointer_cast(v);
4575   } else {
4576     cooPerm_v = new THRUSTARRAY(n);
4577     cooPerm_v->assign(v,v+n);
4578     d_v = cooPerm_v->data();
4579     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
4580   }
4581   PetscCall(PetscLogGpuTimeBegin());
4582   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
4583     if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */
4584       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
4585       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
4586       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4587         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
4588         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
4589       */
4590       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
4591       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
4592       delete cooPerm_w;
4593     } else {
4594       /* all nonzeros in d_v[] are unique entries */
4595       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
4596                                                                 matrix->values->begin()));
4597       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
4598                                                                 matrix->values->end()));
4599       thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
4600     }
4601   } else {
4602     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
4603       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
4604       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
4605     } else {
4606       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
4607                                                                 matrix->values->begin()));
4608       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
4609                                                                 matrix->values->end()));
4610       thrust::for_each(zibit,zieit,VecCUDAEquals());
4611     }
4612   }
4613   PetscCall(PetscLogGpuTimeEnd());
4614 finalize:
4615   delete cooPerm_v;
4616   A->offloadmask = PETSC_OFFLOAD_GPU;
4617   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4618   /* shorter version of MatAssemblyEnd_SeqAIJ */
4619   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
4620   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
4621   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
4622   a->reallocs         = 0;
4623   A->info.mallocs    += 0;
4624   A->info.nz_unneeded = 0;
4625   A->assembled = A->was_assembled = PETSC_TRUE;
4626   A->num_ass++;
4627   PetscFunctionReturn(0);
4628 }
4629 
/* Mark the cached device transpose of A as out of date; when destroy is true, also free
   the transpose mult struct and the csr2csc index array */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4645 
4646 #include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices.
   Builds the CSR structure of A on the GPU from the COO index lists: sorts the (i,j)
   pairs, removes duplicates, records in cooPerm the scatter from user order to sorted
   order (and in cooPerm_a, when duplicates exist, the map from sorted entry to unique
   nonzero), then mirrors the row pointers/column indices back to the host arrays. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* previous COO data has a different size; discard it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);          /* device copies of the COO index lists */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);  /* per-row end offsets, computed below */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of the SeqAIJ part with ones matching the new pattern */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count rows with at least one nonzero */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); /* empty matrix */
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
4759 
/*
  COO preallocation entry point. If the index arrays live on the device, or live on the
  host and contain no negative indices, use the thrust-based 'Basic' path; otherwise
  fall back to the host SeqAIJ COO preallocation (which handles negative = ignored
  entries) and mirror the resulting jmap/perm maps on the device ("extended" COO).
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* discard any previous COO state on both host and device sides */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* device arrays cannot be scanned here; only host arrays are checked for negatives */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* mirror the host-computed jmap (nonzero -> range of COO entries) and perm maps on device */
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4795 
/* Kernel: fold COO values into the CSR value array.
   One (virtual) thread per nonzero i: gathers every COO entry mapped to i via
   jmap/perm and either adds the sum to a[i] or overwrites a[i] (INSERT_VALUES).
   Uses a 1-D grid-stride loop, so any 1-D grid/block launch configuration is valid. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) { /* grid-stride loop over the nonzeros */
    PetscScalar sum = 0.0;
    /* jmap[i]..jmap[i+1] delimits the COO entries contributing to nonzero i */
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4806 
/*
  COO value-insertion entry point. In the extended-COO path the user values are staged
  on the device if needed and folded into the CSR values by the MatAddCOOValues kernel;
  otherwise the thrust-based 'Basic' implementation is used.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites everything, so the old values need not come to the GPU */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      /* 256 threads/block; enough blocks to cover all nonzeros (kernel is grid-stride anyway) */
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); /* free the staging copy */
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}
4841 
4842 /*@C
4843     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4844 
4845    Not collective
4846 
4847     Input Parameters:
4848 +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4850 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4854 
4855     Level: developer
4856 
4857     Notes:
4858       When compressed is true, the CSR structure does not contain empty rows
4859 
4860 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4861 @*/
/* Hand out device pointers to the CSR row offsets and column indices of A,
   copying the matrix to the GPU first if the host copy is newer. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* caller did not request both pointers */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (compressed || !aij->compressedrow.use) *i = csrmat->row_offsets->data().get();
    else {
      /* full row offsets requested but the device CSR skips empty rows:
         lazily build (and cache) the uncompressed offsets from the host index array */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(aij->i,aij->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    }
  }
  if (j) *j = csrmat->column_indices->data().get();
  PetscFunctionReturn(0);
}
4889 
4890 /*@C
4891     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4892 
4893    Not collective
4894 
4895     Input Parameters:
4896 +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4898 
    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4902 
4903     Level: developer
4904 
4905 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4906 @*/
/* Counterpart of MatSeqAIJCUSPARSEGetIJ(): no device state needs updating, so we
   only invalidate the caller's pointers to discourage use after restore. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) *i = NULL; /* zap, do not free: the memory belongs to the matrix */
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4916 
4917 /*@C
4918    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4919 
4920    Not Collective
4921 
4922    Input Parameter:
4923 .   A - a MATSEQAIJCUSPARSE matrix
4924 
4925    Output Parameter:
4926 .   a - pointer to the device data
4927 
4928    Level: developer
4929 
4930    Notes: may trigger host-device copies if up-to-date matrix data is on host
4931 
4932 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4933 @*/
/* Return a read-only device pointer to the CSR values of A; triggers a
   host-to-device copy when the up-to-date data lives on the host. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device values are current */
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* read-only access: offload mask and transpose caches stay untouched */
  PetscFunctionReturn(0);
}
4951 
4952 /*@C
4953    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4954 
4955    Not Collective
4956 
4957    Input Parameter:
4958 .   A - a MATSEQAIJCUSPARSE matrix
4959 
4960    Output Parameter:
4961 .   a - pointer to the device data
4962 
4963    Level: developer
4964 
4965 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4966 @*/
/* Release the read-only values pointer from MatSeqAIJCUSPARSEGetArrayRead().
   Read access cannot have changed the matrix, so no state is invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
4976 
4977 /*@C
4978    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4979 
4980    Not Collective
4981 
4982    Input Parameter:
4983 .   A - a MATSEQAIJCUSPARSE matrix
4984 
4985    Output Parameter:
4986 .   a - pointer to the device data
4987 
4988    Level: developer
4989 
4990    Notes: may trigger host-device copies if up-to-date matrix data is on host
4991 
4992 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4993 @*/
/* Return a read-write device pointer to the CSR values of A. The GPU copy is
   refreshed first, then marked as the authoritative one since the caller may write. */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* bring the latest values to the device */
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is now the valid one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}
5013 /*@C
5014    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
5015 
5016    Not Collective
5017 
5018    Input Parameter:
5019 .   A - a MATSEQAIJCUSPARSE matrix
5020 
5021    Output Parameter:
5022 .   a - pointer to the device data
5023 
5024    Level: developer
5025 
5026 .seealso: `MatSeqAIJCUSPARSEGetArray()`
5027 @*/
/* Release the read-write values pointer from MatSeqAIJCUSPARSEGetArray().
   Values may have been modified, so dependent cached data is invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));          /* cached diagonal may no longer match the values */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* signal observers that the matrix changed */
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
5039 
5040 /*@C
5041    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
5042 
5043    Not Collective
5044 
5045    Input Parameter:
5046 .   A - a MATSEQAIJCUSPARSE matrix
5047 
5048    Output Parameter:
5049 .   a - pointer to the device data
5050 
5051    Level: developer
5052 
5053    Notes: does not trigger host-device copies and flags data validity on the GPU
5054 
5055 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
5056 @*/
/* Return a write-only device pointer to the CSR values of A. Unlike the read
   variants this deliberately performs no host-to-device copy: the caller promises
   to overwrite every value, so the current contents are irrelevant. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller will write on the device */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}
5075 
5076 /*@C
5077    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
5078 
5079    Not Collective
5080 
5081    Input Parameter:
5082 .   A - a MATSEQAIJCUSPARSE matrix
5083 
5084    Output Parameter:
5085 .   a - pointer to the device data
5086 
5087    Level: developer
5088 
5089 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
5090 @*/
/* Release the write-only values pointer from MatSeqAIJCUSPARSEGetArrayWrite().
   Values were (presumably) overwritten, so dependent cached data is invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));          /* cached diagonal may no longer match the values */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* signal observers that the matrix changed */
  *a = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
5102 
/* Strict weak ordering on (row,column) for merging two COO streams; the scalar
   value and origin flag carried in the tuple do not participate in the comparison */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;      /* primary key: row */
    return t1.get<1>() < t2.get<1>();  /* tie-break: column */
  }
};
5113 
/* Unary functor that adds a fixed integer offset; used to shift column indices
   and row offsets when concatenating two CSR matrices */
struct Shift
{
  int _shift; /* constant offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) { return c + _shift; }
};
5125 
5126 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
/* Build (reuse == MAT_INITIAL_MATRIX) or refresh (MAT_REUSE_MATRIX) C = [A,B] entirely on
   the device: both inputs are converted to COO, merged row-major with thrust::merge, and
   converted back to CSR. cooPerm records where each A/B entry landed so that the reuse
   path can update values without redoing the symbolic merge. MAT_INPLACE_MATRIX is not
   supported, and both inputs must be in CSR format with matching row counts. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* symbolic + numeric: create C and its device CSR structure from scratch */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); /* records destination of each A then B entry, for the reuse path */
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand both CSR structures to COO so the entries can be merged as (row,col) streams */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's columns by A->cmap->n on the fly, since B occupies the right block of C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* stable merge of the two sorted (row,col) streams; wPerm records A(1)/B(0) origin */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); /* undo the in-place shift of B's columns */
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split destination positions by origin: cooPerm[0..Annz) for A entries, the rest for B */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged row indices back to CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C' = [A' ; B'] stacked: A' rows come first, B' row offsets get shifted by a->nz */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* B' offsets overwrite the trailing sentinel of A' */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure on the host so the SeqAIJ part of C is consistent */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* numeric-only update: scatter the values of A and B into C through cooPerm */
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* cooPerm[0..Annz) holds the destinations of A's entries, cooPerm[Annz..) those of B's */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
5427 
/* Copy a subset of A's nonzero values into v[]: v[k] = Aa[idx[k]] for k < n, or the
   first n values contiguously when idx is NULL. v[] may live on the host or the
   device (detected via isCudaMem); the gather itself always runs on the device.
   Fix: the transfer to a host v[] is device-to-host, so it must be logged with
   PetscLogGpuToCpu() (the previous PetscLogCpuToGpu() recorded the wrong direction). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;              /* true when v[] is device memory */
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n); /* upload the gather indices */
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* stage the gathered values on the device before the host copy */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather av[idx[k]] -> dv[k] on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar))); /* values were moved device -> host */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
5465