xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 96b592737fa480036d2a4095db7e1939ed49daf5)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Name tables consumed by PetscOptionsEnum(): the leading strings are the enum
   member names in 0-based value order, followed by the enum type name, the common
   prefix to strip from user input, and a null terminator. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Note CUSPARSE_SPMM_COO_ALG4/CSR_ALG2 have values 5/6 but appear at positions 5/6
     in the list only because positions must match the 0-based enum values above. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/* Type-specific backend for MatCUSPARSESetFormat(): record the requested GPU
   storage format on the SEQAIJCUSPARSE data structure.  A sequential matrix keeps
   a single format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are handled identically. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations share the one stored format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
112 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* Dispatch to the type-specific implementation if this matrix type provides one;
     silently a no-op for other matrix types */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
137 
/* Type-specific backend for MatCUSPARSESetUseCPUSolve(): record whether MatSolve
   should run on the CPU instead of the GPU for this SEQAIJCUSPARSE matrix.
   The flag is consulted in MatLUFactorNumeric_SeqAIJCUSPARSE() when choosing the
   solve routines. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
146 
/*@
   MatCUSPARSESetUseCPUSolve - Sets whether MatSolve is done on the CPU.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* Dispatch to the type-specific implementation if this matrix type provides one;
     silently a no-op for other matrix types */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
172 
/* MatSetOption implementation for SEQAIJCUSPARSE: intercept
   MAT_FORM_EXPLICIT_TRANSPOSE so the cached GPU transpose can be invalidated;
   delegate every other option to the host SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
188 
189 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
190 
/* Numeric LU factorization for SEQAIJCUSPARSE: the factors are computed with the
   built-in CPU kernel, then (unless CPU solves were requested) the triangular
   factors are analyzed and copied to the GPU and the GPU solve routines are
   installed.  Natural-ordering variants are used when both permutations are
   the identity. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ*)B->data;
  IS                 isrow  = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusp  = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factors currently live on the host */

  /* determine which version of MatSolve needs to be used */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!cusp->use_cpu_solve) {
    const PetscBool natural = (PetscBool)(row_identity && col_identity);

    B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors to the GPU and run the solve analysis */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
224 
/* Process the -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix.
   The options apply only to non-factored matrices; for factored matrices the
   body is skipped entirely.  The PetscCheck() calls below guard against the
   cuSPARSE enum values drifting out of sync with the positional name tables
   (MatCUSPARSESpMVAlgorithms[] etc.) used by PetscOptionsEnum(). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for the MatMult operation only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    /* storage format used for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
264 
/* Build (or refresh) the GPU copy of the unit-diagonal lower triangular ILU
   factor L from the host Mat_SeqAIJ factor arrays.  The SeqAIJ factor stores L
   without its unit diagonal, so the diagonal 1's are inserted here row by row.
   On first call the full CSR structure is assembled in pinned host memory,
   uploaded to device thrust arrays, and the cusparse triangular-solve analysis
   is run; on later calls only the numerical values are refreshed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): ai[] here indexes the strictly-lower part of the factored
         matrix; row 0 of L is just the unit diagonal, hence the -ai[1] term */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host memory so the thrust assign()s below can copy efficiently */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* lower triangular, unit diagonal — matches the structure built above */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload the host CSR arrays into device thrust vectors */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 interface needs an explicit work buffer sized by the library */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer; keep AALo pinned for fast value refreshes later */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix: sparsity is unchanged, so only values
           are rebuilt on the host and re-uploaded */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
408 
/* Build (or refresh) the GPU copy of the upper triangular ILU factor U from the
   host Mat_SeqAIJ factor arrays.  The SeqAIJ factor stores U rows in reverse via
   a->diag (adiag) and keeps the reciprocal of the diagonal, so rows are assembled
   back-to-front and the stored diagonal is re-inverted (1./v[nz]) here.  First
   call builds the CSR structure in pinned memory, uploads it, and runs the
   cusparse solve analysis; later calls refresh only the values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* NOTE(review): adiag[] decreases with i in the SeqAIJ factor layout, so
         adiag[0]-adiag[n] spans all of U including the diagonal */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host memory so the thrust assign()s below can copy efficiently */
        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements: the factor stores 1/d, invert back */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* upper triangular with an explicitly stored (non-unit) diagonal */
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload the host CSR arrays into device thrust vectors */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 interface needs an explicit work buffer sized by the library */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer; keep AAUp pinned for fast value refreshes later */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix: sparsity is unchanged, so only values
           are rebuilt on the host and re-uploaded */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
551 
/* Move both ILU triangular factors of A to the GPU (building them and running the
   cusparse solve analysis as needed), allocate the shared work vector used by the
   triangular solves, and upload the row/column permutation indices when the
   orderings are not the identity.  Marks the matrix as resident on both host
   and device. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* device scratch vector shared by the lower/upper triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation only when it is nontrivial
     and has not been uploaded before */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices: same for the column permutation */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
594 
/*
   Builds (or refreshes) the GPU triangular-factor data for an ICC factor matrix A.

   The host factor is stored in SBAIJ layout (upper triangle, diagonal last in each
   row).  This routine scales it into a unit-diagonal upper factor U (values AAUp)
   and, reusing the same sparsity pattern, the data for solving with U^T (values
   AALo, solved with CUSPARSE_OPERATION_TRANSPOSE).  On first call it allocates the
   cuSPARSE descriptors, CSR storage and solve-analysis info for both factors; on
   subsequent calls (same nonzero pattern) it only re-uploads the numerical values.

   No-op unless the matrix values live on the CPU (offloadmask UNALLOCATED or CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is reinterpreted as SBAIJ here; this relies on the ICC
     factor being stored in SBAIJ-compatible CSR form (i/j/a fields alias) — confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the scaled factor values */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* first call: build structure and values together */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: v[nz] is the stored diagonal,
             so 1/v[nz] makes the factor unit-diagonal after scaling */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: U gets -v, the transpose-solve data gets -v/diag */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix: copy the CSR structure and scaled values to the GPU */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 API needs an explicit workspace buffer */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation: the "lower" factor is the stored upper factor solved transposed */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same structure (AiUp/AjUp) as the upper factor, AALo values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        /* log host->device traffic: two index arrays plus two value arrays were uploaded */
        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* subsequent calls: structure unchanged, only recompute and re-upload values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    /* NOTE(review): thrust throws std::bad_alloc / thrust::system_error, which a
       char* handler will not catch — consider catching const std::exception& */
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
811 
/*
   Pushes the ICC factor of A to the GPU and prepares everything needed by the
   CUSPARSE triangular solves: the factor matrices themselves, a work vector,
   and (when the row permutation is not the identity) the row/column
   permutation index arrays on the device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  PetscBool                    identity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* both triangles are stored, sharing the diagonal */
  factors->nnz = (aij->nz - nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the permutation (and its inverse) only when it is not the identity */
  PetscCall(ISIdentity(rowperm,&identity));
  if (!identity) {
    IS             inv;
    const PetscInt *invidx,*idx;

    PetscCall(ISInvertPermutation(rowperm,PETSC_DECIDE,&inv));
    PetscCall(ISGetIndices(inv,&invidx));
    PetscCall(ISGetIndices(rowperm,&idx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(invidx, invidx+nrows);
    PetscCall(ISRestoreIndices(inv,&invidx));
    PetscCall(ISDestroy(&inv));
    PetscCall(ISRestoreIndices(rowperm,&idx));
    PetscCall(PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
848 
/*
   Numeric Cholesky factorization for SEQAIJCUSPARSE: factor on the CPU via the
   AIJ kernel, then install the appropriate GPU solve routines (the natural-
   ordering fast path when the row permutation is the identity) and copy the
   factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *bmat = (Mat_SeqAIJ*)B->data;
  PetscBool  natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used */
  PetscCall(ISIdentity(bmat->row,&natural));
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve support in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
877 
/*
   Builds the transposes of the lower and upper triangular factors of A (as CSC,
   via cusparse csr2csc) and runs the csrsv solve analysis on them, so that
   MatSolveTranspose can run entirely on the GPU.  The fill mode of each
   transpose is flipped relative to its source factor; the diag type, index base
   and matrix type are inherited.  Results are stored in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose on A->spptr.

   Fixes vs. previous revision: the MAT_CUSPARSEGenerateTranspose event was
   begun twice and never ended (PetscLogEventBegin where PetscLogEventEnd was
   required), which left unbalanced event pairs in -log_view output.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor: same type, base
     and diag type, but the fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: the transpose is stored explicitly, so solve non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* csr2cscEx2 needs an explicit workspace buffer, kept on the source factor */
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
                                  #else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was Begin: unbalanced event */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                         loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                         loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                         loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                         &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                   loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                   loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                   loTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   loTriFactorT->solveInfo,
                                   loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
                                   #else
                                   loTriFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                               upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                               upTriFactor->csrMat->values->data().get(),
                                               upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(),
                                               upTriFactorT->csrMat->values->data().get(),
                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                  upTriFactor->csrMat->values->data().get(),
                                  upTriFactor->csrMat->row_offsets->data().get(),
                                  upTriFactor->csrMat->column_indices->data().get(),
                                  upTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
                                  #else
                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was Begin: unbalanced event */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                         upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                         upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                         upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                         &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                   upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                   upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                   upTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   upTriFactorT->solveInfo,
                                   upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
                                   #else
                                   upTriFactorT->solveInfo));
                                   #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1081 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its real
   part; callable from both host and device (e.g. in thrust transforms). */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1090 
1091 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1092 {
1093   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1094   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1095   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1096   cusparseStatus_t             stat;
1097   cusparseIndexBase_t          indexBase;
1098 
1099   PetscFunctionBegin;
1100   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1101   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1102   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1103   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1104   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1105   if (A->transupdated) PetscFunctionReturn(0);
1106   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1107   PetscCall(PetscLogGpuTimeBegin());
1108   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1109     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1110   }
1111   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1112     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1113     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1114     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1115     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1116     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1117 
1118     /* set alpha and beta */
1119     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
1120     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
1121     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1122     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1123     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
1124     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1125 
1126     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1127       CsrMatrix *matrixT = new CsrMatrix;
1128       matstructT->mat = matrixT;
1129       matrixT->num_rows = A->cmap->n;
1130       matrixT->num_cols = A->rmap->n;
1131       matrixT->num_entries = a->nz;
1132       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1133       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1134       matrixT->values = new THRUSTARRAY(a->nz);
1135 
1136       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1137       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1138 
1139      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1140       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1141         stat = cusparseCreateCsr(&matstructT->matDescr,
1142                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1143                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1144                                matrixT->values->data().get(),
1145                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1146                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1147       #else
1148         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1149            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1150 
1151            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1152            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1153            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1154         */
1155         if (matrixT->num_entries) {
1156           stat = cusparseCreateCsr(&matstructT->matDescr,
1157                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1158                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1159                                  matrixT->values->data().get(),
1160                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1161                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1162 
1163         } else {
1164           matstructT->matDescr = NULL;
1165           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1166         }
1167       #endif
1168      #endif
1169     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1170    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1171       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1172    #else
1173       CsrMatrix *temp  = new CsrMatrix;
1174       CsrMatrix *tempT = new CsrMatrix;
1175       /* First convert HYB to CSR */
1176       temp->num_rows = A->rmap->n;
1177       temp->num_cols = A->cmap->n;
1178       temp->num_entries = a->nz;
1179       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1180       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1181       temp->values = new THRUSTARRAY(a->nz);
1182 
1183       stat = cusparse_hyb2csr(cusparsestruct->handle,
1184                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1185                               temp->values->data().get(),
1186                               temp->row_offsets->data().get(),
1187                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1188 
1189       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1190       tempT->num_rows = A->rmap->n;
1191       tempT->num_cols = A->cmap->n;
1192       tempT->num_entries = a->nz;
1193       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1194       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1195       tempT->values = new THRUSTARRAY(a->nz);
1196 
1197       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1198                               temp->num_cols, temp->num_entries,
1199                               temp->values->data().get(),
1200                               temp->row_offsets->data().get(),
1201                               temp->column_indices->data().get(),
1202                               tempT->values->data().get(),
1203                               tempT->column_indices->data().get(),
1204                               tempT->row_offsets->data().get(),
1205                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1206 
1207       /* Last, convert CSC to HYB */
1208       cusparseHybMat_t hybMat;
1209       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1210       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1211         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1212       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1213                               matstructT->descr, tempT->values->data().get(),
1214                               tempT->row_offsets->data().get(),
1215                               tempT->column_indices->data().get(),
1216                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1217 
1218       /* assign the pointer */
1219       matstructT->mat = hybMat;
1220       A->transupdated = PETSC_TRUE;
1221       /* delete temporaries */
1222       if (tempT) {
1223         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1224         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1225         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1226         delete (CsrMatrix*) tempT;
1227       }
1228       if (temp) {
1229         if (temp->values) delete (THRUSTARRAY*) temp->values;
1230         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1231         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1232         delete (CsrMatrix*) temp;
1233       }
1234      #endif
1235     }
1236   }
1237   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1238     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1239     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1240     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1241     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1242     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1243     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1244     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1245     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1246     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1247     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1248     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1249       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1250       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1251       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1252     }
1253     if (!cusparsestruct->csr2csc_i) {
1254       THRUSTARRAY csr2csc_a(matrix->num_entries);
1255       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1256 
1257       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1258      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259       void   *csr2cscBuffer;
1260       size_t csr2cscBufferSize;
1261       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1262                                            A->cmap->n, matrix->num_entries,
1263                                            matrix->values->data().get(),
1264                                            cusparsestruct->rowoffsets_gpu->data().get(),
1265                                            matrix->column_indices->data().get(),
1266                                            matrixT->values->data().get(),
1267                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1268                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1269                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
1270       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1271      #endif
1272 
1273       if (matrix->num_entries) {
1274         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1275            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1276            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1277 
1278            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1279            should be filled with indexBase. So I just take a shortcut here.
1280         */
1281         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1282                               A->cmap->n,matrix->num_entries,
1283                               csr2csc_a.data().get(),
1284                               cusparsestruct->rowoffsets_gpu->data().get(),
1285                               matrix->column_indices->data().get(),
1286                               matrixT->values->data().get(),
1287                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1288                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1289                               CUSPARSE_ACTION_NUMERIC,indexBase,
1290                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1291                              #else
1292                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1293                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1294                              #endif
1295       } else {
1296         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1297       }
1298 
1299       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1300       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1301      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1302       PetscCallCUDA(cudaFree(csr2cscBuffer));
1303      #endif
1304     }
1305     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1306                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1307                                                      matrixT->values->begin()));
1308   }
1309   PetscCall(PetscLogGpuTimeEnd());
1310   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1311   /* the compressed row indices is not used for matTranspose */
1312   matstructT->cprowIndices = NULL;
1313   /* assign the pointer */
1314   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1315   A->transupdated = PETSC_TRUE;
1316   PetscFunctionReturn(0);
1317 }
1318 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using the transposed
   triangular factors (built lazily on first use) together with the row/column
   permutations stored with the factorization.

   Device-side steps:
     1. scatter b through rpermIndices into x,
     2. triangular solve with the transposed upper factor (x -> work vector),
     3. triangular solve with the transposed lower factor (work vector -> x),
     4. gather x through cpermIndices into the work vector, then copy it back to x
        (the permutation cannot be applied in place).

   Input:  bb - right-hand side (read only)
   Output: xx - solution (overwritten)
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only; later solves reuse the cached factors) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: since A^T = (LU)^T = U^T L^T, the upper factor is applied before the lower one */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve L: input is the work vector from the U solve, output goes back into x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: ~2 flops per factored nonzero, less one per row */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1406 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b on the GPU when the
   factorization was done in natural ordering, i.e. without row/column permutations, so b
   can feed the triangular solves directly: solve U (b -> work vector), then solve L
   (work vector -> x).

   Input:  bb - right-hand side (read only)
   Output: xx - solution (overwritten)
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only; later solves reuse the cached factors) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U: since A^T = (LU)^T = U^T L^T, the upper factor is applied before the lower one */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve L: input is the work vector from the U solve, output is the final solution x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: ~2 flops per factored nonzero, less one per row */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1475 
/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU with the cached triangular factors
   and the row/column permutations of the factorization.

   Device-side steps:
     1. scatter b through rpermIndices into the work vector,
     2. forward solve with L (work vector -> x),
     3. backward solve with U (x -> work vector),
     4. gather the work vector through cpermIndices into x.

   Input:  bb - right-hand side (read only)
   Output: xx - solution (overwritten)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: temp[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (forward substitution): input is the permuted rhs, output goes into x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve U (backward substitution): input is x, output goes into the work vector */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: x[i] = temp[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: ~2 flops per factored nonzero, less one per row */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1550 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solves A x = b on the GPU when the
   factorization was done in natural ordering, i.e. without row/column permutations, so b
   can feed the triangular solves directly: solve L (b -> work vector), then solve U
   (work vector -> x).

   Input:  bb - right-hand side (read only)
   Output: xx - solution (overwritten)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L (forward substitution): input is b, output goes into the work vector */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Next, solve U (backward substitution): input is the work vector, output is the final solution x */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: ~2 flops per factored nonzero, less one per row */
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1611 
1612 #if CUSPARSE_VERSION >= 11500
1613 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/*
   MatSolve_SeqAIJCUSPARSE_ILU0 - solves fact x = b via the generic cuSPARSE SpSV
   interface (cusparse >= 11.3): forward solve with L into the internal work vector
   fs->Y, then backward solve with U into x. The dense-vector descriptors
   dnVecDescr_X/dnVecDescr_Y are simply re-pointed at the caller's device arrays.

   Input:  b - right-hand side (read only)
   Output: x - solution (overwritten)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *csraij  = (Mat_SeqAIJ*)fact->data;
  PetscScalar                  *xv;
  const PetscScalar            *bv;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xv));
  PetscCall(VecCUDAGetArrayRead(b,&bv));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b, with y kept in the internal work vector factors->Y.
     Note cusparseSpSV_solve() secretly uses the external buffer that was passed to
     cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X,(void*)bv));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y,factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       factors->spMatDescr_L, /* L Y = X */
                                       factors->dnVecDescr_X,factors->dnVecDescr_Y,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_L));

  /* Backward solve U x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X,xv));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       factors->spMatDescr_U, /* U X = Y */
                                       factors->dnVecDescr_Y,factors->dnVecDescr_X,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b,&bv));
  PetscCall(VecCUDARestoreArrayWrite(x,&xv));

  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: ~2 flops per factored nonzero, less one per row */
  PetscCall(PetscLogGpuFlops(2.0*csraij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1658 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_ILU0 - solves fact^T x = b via the generic cuSPARSE
   SpSV interface, performing CUSPARSE_OPERATION_TRANSPOSE solves against the very same
   spMatDescr_L/spMatDescr_U descriptors that the forward solve uses. Since
   (L U)^T = U^T L^T, U^T is applied first (into the work vector fs->Y), then L^T.

   The transpose SpSV descriptors plus their work buffers are created on the first call,
   and the analysis phase is redone whenever updatedTransposeSpSVAnalysis has been
   cleared (presumably when the numeric factors change -- the reset happens elsewhere).

   Input:  b - right-hand side (read only)
   Output: x - solution (overwritten)
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *csraij  = (Mat_SeqAIJ*)fact->data;
  PetscScalar                  *xv;
  const PetscScalar            *bv;

  PetscFunctionBegin;
  if (!factors->createdTransposeSpSVDescr) { /* first MatSolveTranspose() call: create descriptors and size their buffers */
    /* the matrix is still L; we merely request a transpose solve with it */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                              factors->spMatDescr_L,factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                              cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Lt,
                                              &factors->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                              factors->spMatDescr_U,factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                              cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Ut,
                                              &factors->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void**)&factors->spsvBuffer_Lt,factors->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void**)&factors->spsvBuffer_Ut,factors->spsvBufferSize_Ut));
    factors->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!factors->updatedTransposeSpSVAnalysis) { /* run the analysis phase if it is not (or no longer) valid */
    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                            factors->spMatDescr_L,factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                            cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Lt,
                                            factors->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                            factors->spMatDescr_U,factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                            cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Ut,
                                            factors->spsvBuffer_Ut));
    factors->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x,&xv));
  PetscCall(VecCUDAGetArrayRead(b,&bv));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut y = b, with y kept in the internal work vector factors->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X,(void*)bv));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y,factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       factors->spMatDescr_U, /* Ut Y = X */
                                       factors->dnVecDescr_X,factors->dnVecDescr_Y,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Ut));

  /* Solve Lt x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X,xv));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       factors->spMatDescr_L, /* Lt X = Y */
                                       factors->dnVecDescr_Y,factors->dnVecDescr_X,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&bv));
  PetscCall(VecCUDARestoreArrayWrite(x,&xv));
  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves: ~2 flops per factored nonzero, less one per row */
  PetscCall(PetscLogGpuFlops(2.0*csraij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1756 
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - Numeric ILU(0) factorization via cuSPARSE csrilu02

  The symbolic phase (MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0) already copied A's sparsity
  pattern to fact's device arrays and ran the csrilu02 structural analysis. Here we only
    1) copy A's current values to fact's device value array,
    2) factor in place with cusparseXcsrilu02() (L and U share the single CSR array fs->csrVal), and
    3) redo the SpSV analysis for L and U, which reads matrix values and therefore must run
       after the factorization.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact; ILU(0) adds no fill, so fact shares A's nonzero pattern */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* Blocking call: reports the first row whose pivot became numerically zero during factorization */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_U,
                    fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU; /* the factored values exist only on the device */
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed once in the symbolic phase */
  PetscFunctionReturn(0);
}
1831 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - Symbolic setup for cuSPARSE's native ILU(0)

  ILU(0) factors have exactly A's sparsity pattern, so no fill computation is required.
  This routine
    - copies A's row pointer / column index arrays to device buffers owned by fact (the
      numeric phase factors in place into fs->csrVal),
    - creates the cuSPARSE descriptors for M (the combined L/U storage), L (unit lower) and
      U (non-unit upper),
    - queries and allocates the csrilu02 factorization buffer and the two SpSV solve buffers,
      aliasing the factorization buffer with the larger solve buffer to save device memory,
    - runs the csrilu02 structural analysis (checking for structural zero pivots in debug mode),
    - estimates the flops of one numeric factorization, logged later by the numeric phase.

  Note: isrow/iscol are not used here; the caller dispatches to this routine only for
  identity (natural) orderings.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0): the factor uses exactly A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L of an ILU factorization has an implicit unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; /* the stored diagonal belongs to U */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    &fs->factBufferSize_M));

  /* Work vectors bound to the dense-vector descriptors used by SpSV at solve time */
  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_U,
                    &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_U,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *Ai,*Adiag,nzRow,nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A)); /* fill Aseq->diag with the position of each row's diagonal entry */
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i=0; i<m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i+1] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
           Model nzLeft as half of the row's off-diagonal count, as done in the ICC0 estimate; the exact
           count would be Adiag[i]-Ai[i]. (A previous dead store of that exact value was removed here.)
        */
        nzLeft = (nzRow-1)/2;
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}
2041 
/*
  MatSolve_SeqAIJCUSPARSE_ICC0 - Solve A x = b using the IC(0) factorization A ~ L*Lt
  computed by MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(), as two cuSPARSE triangular solves:
  L y = b, then Lt x = y. The numeric phase installs this routine for both solve and
  solvetranspose, since the factorization is symmetric.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors  *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar             *barray;
  PetscScalar                   *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xarray));
  PetscCall(VecCUDAGetArrayRead(b,&barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b: bind b's array to dnVecDescr_X (rhs) and the internal work array fs->Y to dnVecDescr_Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); /* cast away const; SpSV only reads the rhs */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L, /* L Y = X */
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L));

  /* Solve Lt*x = y: re-point dnVecDescr_X at x's output array; fs->Y now holds the rhs */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L, /* Lt X = Y */
                    fs->dnVecDescr_Y,
                    fs->dnVecDescr_X,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&barray));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); /* ~2 flops per stored nonzero over the two solves */
  PetscFunctionReturn(0);
}
2086 
/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - Numeric IC(0) (incomplete Cholesky) factorization
  via cuSPARSE csric02

  The symbolic phase (MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0) set up the pattern, descriptors
  and work buffers. Here we copy A's values to the device, factor in place with
  cusparseXcsric02(), then run the SpSV analyses for the L and Lt solves; those analyses read
  matrix values, so they must follow the factorization.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, /* cusparseXcsric02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ic0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* Blocking call: reports the first row whose pivot became numerically zero during factorization */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  /* Redo the numeric SpSV analysis for the forward solve with L */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Lt,
                    fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU; /* the factored values exist only on the device */
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* A ~ L*Lt is symmetric, so transpose solve == solve */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed once in the symbolic phase */
  PetscFunctionReturn(0);
}
2164 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - Symbolic setup for cuSPARSE's native IC(0)

  IC(0) factors have exactly A's sparsity pattern, so no fill computation is required.
  This routine copies A's index arrays to device buffers owned by fact (the numeric phase
  factors in place into fs->csrVal), creates the cuSPARSE descriptors for M and L (only L is
  needed; the transpose solve reuses L's descriptor with CUSPARSE_OPERATION_TRANSPOSE),
  queries and allocates the csric02 and SpSV work buffers, runs the csric02 structural
  analysis, and estimates the flops of one numeric factorization for later logging.

  Note: perm is not used here; the caller dispatches to this routine only for identity
  (natural) orderings.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0): the factor uses exactly A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; /* unlike ILU, the Cholesky factor L stores its real diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, /* cusparseXcsric02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ic0Info_M,
                    &fs->factBufferSize_M));

  /* Work vectors bound to the dense-vector descriptors used by SpSV at solve time */
  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Lt,
                    &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_Lt,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, /* cusparseXcsric02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ic0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *Ai,nzRow,nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i=0; i<m; i++) {
      nzRow = Ai[i+1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
          Without the diagonal positions at hand, model nzLeft as half of the row's off-diagonal count.
        */
        nzLeft = (nzRow-1)/2;
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
2350 #endif
2351 
/*
  Symbolic ILU factorization for SEQAIJCUSPARSE matrices.

  With cuSPARSE >= 11.5, ILU(0) with identity row/column permutations can be
  handled by the dedicated device path; in all other cases the symbolic phase
  is done on the host by the SeqAIJ implementation, and the CUSPARSE numeric
  factorization routine is registered for the subsequent numeric phase.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
 #if CUSPARSE_VERSION >= 11500
  /* The device ILU0 fast path requires zero fill levels and identity orderings */
  PetscBool row_identity = PETSC_FALSE,col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow,&row_identity));
    PetscCall(ISIdentity(iscol,&col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info));
  } else
 #endif
  {
    /* Host fallback: drop any existing device triangular-factor data, then run the CPU symbolic ILU */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
2374 
/*
  Symbolic LU factorization for SEQAIJCUSPARSE matrices.

  The symbolic phase itself is always performed on the host by the SeqAIJ
  implementation; this wrapper first discards any device triangular-factor
  data left from a previous factorization, then registers the CUSPARSE
  numeric factorization routine for the numeric phase.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* Clear stale device factors before redoing the symbolic phase */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2385 
/*
  Symbolic ICC factorization for SEQAIJCUSPARSE matrices.

  With cuSPARSE >= 11.5, ICC(0) with an identity permutation can be handled by
  the dedicated device path; otherwise the symbolic phase falls back to the
  host (SeqAIJ) implementation and the CUSPARSE Cholesky numeric routine is
  registered for the numeric phase.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
 #if CUSPARSE_VERSION >= 11500
  /* The device ICC0 fast path requires zero fill levels and an identity ordering */
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm,&perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info));
  } else
 #endif
  {
    /* Host fallback: drop any existing device triangular-factor data, then run the CPU symbolic ICC */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
2405 
/*
  Symbolic Cholesky factorization for SEQAIJCUSPARSE matrices.

  The symbolic phase is done on the host by the SeqAIJ implementation after
  any device triangular-factor data from a previous factorization has been
  released; the CUSPARSE numeric Cholesky routine handles the numeric phase.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* Clear stale device factors before redoing the symbolic phase */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2416 
/* Report the MatSolverType implemented by this factorization back-end (always MATSOLVERCUSPARSE) */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2423 
2424 /*MC
2425   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2426   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2428   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2429   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2430   algorithms are not recommended. This class does NOT support direct solver operations.
2431 
2432   Level: beginner
2433 
2434 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2435 M*/
2436 
/*
  Create a factor matrix B of type SEQAIJCUSPARSE for A, to be used with the
  MATSOLVERCUSPARSE solver package.

  Supported factor types are LU/ILU/ILUDT and Cholesky/ICC; any other ftype
  raises PETSC_ERR_SUP. The option -mat_factor_bind_factorization selects
  whether factorization is performed on the "host" or the "device" (default).
  If A is bound to the CPU, the plain SeqAIJ symbolic routines are installed
  instead of the CUSPARSE ones.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;
  PetscBool      factOnDevice,factOnHost;
  char           *prefix;
  char           factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* Parse where (host/device) the user wants the factorization performed */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)),prefix,"MatGetFactor","Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization","Do matrix factorization on host or device when possible","MatGetFactor",NULL,factPlace,sizeof(factPlace),NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device",factPlace,&factOnDevice));
  PetscCall(PetscStrcasecmp("host",factPlace,&factOnHost));
  PetscCheck(factOnDevice || factOnHost,PetscObjectComm((PetscObject)(*B)),PETSC_ERR_ARG_OUTOFRANGE,"Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed",factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors*)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    /* Pick CUSPARSE symbolic routines unless A is bound to the CPU */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for incomplete variants */
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* Storage is allocated later by the symbolic factorization */
  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2489 
/*
  Copy the matrix values from the device back to the host when the device copy
  is the only up-to-date one (offloadmask == PETSC_OFFLOAD_GPU).

  Handles both unfactored matrices (values live in the CsrMatrix inside
  Mat_SeqAIJCUSPARSE) and, for cuSPARSE >= 13.5, factored matrices whose
  values live in the triangular-factors structure (fs->csrVal). On success the
  offload mask becomes PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
 #if CUSPARSE_VERSION >= 13500
  /* Same spptr, but for factored matrices it holds the triangular factors */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
 #endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* Unfactored: values are in the device CSR structure */
      CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
   #if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
   #endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2518 
/* Give read-write access to the host values array, syncing from the device first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Make sure the host copy of the values is current before handing it out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2526 
/* End read-write access to the host values array */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Values may have been modified on the host, so the device copy is now stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
2534 
/* Give read-only access to the host values array, syncing from the device first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Make sure the host copy of the values is current before handing it out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2542 
/* End read-only access to the host values array */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  /* Read-only access leaves both copies valid, so the offload mask is untouched;
     just drop the borrowed pointer */
  *array = NULL;
  PetscFunctionReturn(0);
}
2549 
/* Give write-only access to the host values array */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* Write-only: the current values will be overwritten, so no device-to-host copy is done */
  *array = aij->a;
  PetscFunctionReturn(0);
}
2556 
/* End write-only access to the host values array */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* The host array was written; mark the device copy as stale */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
2564 
/*
  Expose the device CSR arrays (row offsets, column indices, values) of an
  unfactored SEQAIJCUSPARSE matrix, along with the memory type they live in.

  Any of i, j, a, mtype may be NULL when the caller does not need them. The
  returned pointers reference GPU memory owned by the matrix. Only 32-bit
  index builds are supported, because the device CSR arrays use 32-bit
  integers (THRUSTINTARRAY32).

  Fix: the two SETERRQ messages read "does not supported"; corrected to
  "does not support".
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  /* Make sure the device copy is current before handing out device pointers */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
2595 
/*
  Copy the matrix from the host to the device when the host copy is the only
  up-to-date one.

  Two paths:
  - If the nonzero state is unchanged and the format is CSR, only the values
    are copied into the existing device CSR structure.
  - Otherwise the device structures are rebuilt from scratch: the old mult
    structure, work vector, and row-offset array are destroyed and a new
    CsrMatrix (and, before CUDA 11, optionally an ELL/HYB matrix) is created
    from the host CSR (or compressed-row CSR) arrays.

  On success the offload mask is set to PETSC_OFFLOAD_BOTH, except when the
  host has no values array (a->a == NULL): then only the structure was copied
  and the mask is left unchanged (tracked by the `both` flag).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* Values changed, so any cached explicit transpose is out of date */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* Nonzero structure changed (or non-CSR format): rebuild the device structures */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Use the compressed-row representation (only nonempty rows) when available */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* No values on the host: copy structure only; remember via `both` that the mask must not become BOTH */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants 1 and 0, used with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* Build a temporary CSR matrix on device, convert it to HYB/ELL, then discard the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2750 
/* Thrust functor over a zipped (src,dst) tuple: accumulate src into dst (dst += src) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2760 
/* Thrust functor over a zipped (src,dst) tuple: copy src into dst (dst = src) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* element 0 is the source, element 1 is the destination */
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
2770 
/* Thrust functor over a zipped tuple with the roles reversed: copy element 1 into element 0 */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* element 1 is the source, element 0 is the destination */
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2780 
/* Per-product scratch data attached to Mat_Product for cuSPARSE-based
   matrix-matrix products (sparse-dense SpMM and sparse-sparse SpGEMM) */
struct MatMatCusparse {
  PetscBool             cisdense;  /* was C originally MATSEQDENSE (CPU)? If so, convert back after the GPU computation */
  PetscScalar           *Bt;       /* device buffer for B^T (pre-CUDA-11 csrmm cannot transpose B itself) */
  Mat                   X;         /* intermediate dense product used for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor for C (or X for PtAP/RARt) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;    /* NOTE(review): extra SpGEMM work buffers, presumably for the reuse API — usage not visible here */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* size of mmBuffer; reallocated when a larger buffer is requested */
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2805 
/*
  Destructor for the MatMatCusparse product data: releases the device buffers,
  cuSPARSE descriptors, the intermediate dense matrix X, and the struct itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt)); /* cudaFree(NULL) is a no-op */
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* Descriptors/buffers are only destroyed if they were actually created */
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5)  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
 #endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2829 
2830 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2831 
/*
  Numeric phase of the product of a SEQAIJCUSPARSE matrix A with a dense
  matrix B (AB, AtB, ABt, PtAP, RARt) into a dense C.

  For PtAP/RARt the sparse-dense multiply produces the intermediate X = A*P
  (resp. A*R^T), which is then combined with B by a dense-dense multiply.
  With CUDA >= 11 this uses cusparseSpMM (which supports op(B)); before that
  it uses csrmm and, for ABt/RARt, explicitly transposes B into mmdata->Bt
  with cublasXgeam first. If B or C were CPU dense matrices, they are
  converted in place to/from MATSEQDENSECUDA around the computation.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* Select which device matrix (A or its cached transpose) and which cuSPARSE op
     to use, plus the dimensions m x n of the sparse-dense product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* Either let cuSPARSE transpose A on the fly, or use an explicitly formed transpose */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  /* For PtAP/RARt the sparse-dense product goes into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    /* Lazily create the sparse descriptor of A from its device CSR arrays */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* Grow the workspace only when needed; keep a larger existing buffer */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* Explicitly form B^T into mmdata->Bt with a cuBLAS out-of-place transpose */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
 #endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* For RARt/PtAP, finish with the dense-dense multiply combining B and X into C */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* Convert C and B back to their original (CPU dense) types if needed */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
3002 
/* Symbolic phase of a sparse (SEQAIJCUSPARSE) times dense product: decides the sizes and
   type of the result C and allocates the MatMatCusparse product data (scratch buffers,
   intermediate matrices) consumed by the numeric phase.
   Supported products: AB, AtB, ABt, PtAP, RARt. A must be of type MATSEQAIJCUSPARSE in
   MAT_CUSPARSE_CSR storage format; errors otherwise. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           nrows,ncols;
  PetscBool          isdense,match;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&match));
  PetscCheck(match,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the result depend on the requested product type */
  if (product->type == MATPRODUCT_AB) {
    nrows = A->rmap->n; ncols = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    nrows = A->cmap->n; ncols = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    nrows = A->rmap->n; ncols = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    nrows = B->cmap->n; ncols = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    nrows = B->rmap->n; ncols = B->rmap->n;
  } else SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  PetscCall(MatSetSizes(C,nrows,ncols,nrows,ncols));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&isdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* allocate and populate the product data handed to the numeric phase */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = isdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* PtAP and RARt are computed through an intermediate dense product X */
  if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
3075 
/* Numeric phase of the sparse-sparse product C = op(A)*op(B) on the GPU via cuSPARSE SpGEMM.
   The symbolic phase must already have allocated C's CSR structure and the MatMatCusparse
   product data (descriptors, work buffers); this routine only (re)computes the values.
   Transposed operands are handled through explicit transposes formed during the symbolic
   phase (opA/opB stay NON_TRANSPOSE for the cuSPARSE calls).
   NOTE(review): the finalize bookkeeping mimics MatAssemblyEnd_SeqAIJ — see comment below. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  /* empty result: nothing to compute, only the assembly bookkeeping at finalize */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure up-to-date values for A and B are on the device */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetric operands let us replace the transposed products by plain AB;
     the symbolic phase must have made the same substitution or the stored structures don't match */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the mult structs: transposed products use the explicit transposes built in the symbolic phase */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* SpGEMMreuse path: structure and buffers were set up in the symbolic phase; a single
       compute call refreshes the values of C */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  #else
    /* legacy SpGEMM path: recompute with the buffers sized in the symbolic phase, then copy
       the result into C's arrays */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 csrgemm interface: computes values directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
3203 
3204 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3205 {
3206   Mat_Product                  *product = C->product;
3207   Mat                          A,B;
3208   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
3209   Mat_SeqAIJ                   *a,*b,*c;
3210   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
3211   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3212   PetscInt                     i,j,m,n,k;
3213   PetscBool                    flg;
3214   cusparseStatus_t             stat;
3215   MatProductType               ptype;
3216   MatMatCusparse               *mmdata;
3217   PetscLogDouble               flops;
3218   PetscBool                    biscompressed,ciscompressed;
3219 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3220   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
3221   cusparseSpMatDescr_t         BmatSpDescr;
3222 #else
3223   int                          cnz;
3224 #endif
3225   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3226 
3227   PetscFunctionBegin;
3228   MatCheckProduct(C,1);
3229   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
3230   A    = product->A;
3231   B    = product->B;
3232   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
3233   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
3234   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
3235   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
3236   a = (Mat_SeqAIJ*)A->data;
3237   b = (Mat_SeqAIJ*)B->data;
3238   /* product data */
3239   PetscCall(PetscNew(&mmdata));
3240   C->product->data    = mmdata;
3241   C->product->destroy = MatDestroy_MatMatCusparse;
3242 
3243   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3244   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3245   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3246   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
3247   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3248   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3249 
3250   ptype = product->type;
3251   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3252     ptype = MATPRODUCT_AB;
3253     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3254   }
3255   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3256     ptype = MATPRODUCT_AB;
3257     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3258   }
3259   biscompressed = PETSC_FALSE;
3260   ciscompressed = PETSC_FALSE;
3261   switch (ptype) {
3262   case MATPRODUCT_AB:
3263     m = A->rmap->n;
3264     n = B->cmap->n;
3265     k = A->cmap->n;
3266     Amat = Acusp->mat;
3267     Bmat = Bcusp->mat;
3268     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3269     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3270     break;
3271   case MATPRODUCT_AtB:
3272     m = A->cmap->n;
3273     n = B->cmap->n;
3274     k = A->rmap->n;
3275     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3276     Amat = Acusp->matTranspose;
3277     Bmat = Bcusp->mat;
3278     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3279     break;
3280   case MATPRODUCT_ABt:
3281     m = A->rmap->n;
3282     n = B->rmap->n;
3283     k = A->cmap->n;
3284     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3285     Amat = Acusp->mat;
3286     Bmat = Bcusp->matTranspose;
3287     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3288     break;
3289   default:
3290     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3291   }
3292 
3293   /* create cusparse matrix */
3294   PetscCall(MatSetSizes(C,m,n,m,n));
3295   PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
3296   c     = (Mat_SeqAIJ*)C->data;
3297   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
3298   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3299   Ccsr  = new CsrMatrix;
3300 
3301   c->compressedrow.use = ciscompressed;
3302   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3303     c->compressedrow.nrows = a->compressedrow.nrows;
3304     PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
3305     PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
3306     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3307     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3308     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
3309   } else {
3310     c->compressedrow.nrows  = 0;
3311     c->compressedrow.i      = NULL;
3312     c->compressedrow.rindex = NULL;
3313     Ccusp->workVector       = NULL;
3314     Cmat->cprowIndices      = NULL;
3315   }
3316   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
3317   Ccusp->mat      = Cmat;
3318   Ccusp->mat->mat = Ccsr;
3319   Ccsr->num_rows    = Ccusp->nrows;
3320   Ccsr->num_cols    = n;
3321   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
3322   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3323   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3324   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3325   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
3326   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
3327   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3328   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
3329   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
3330   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
3331   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3332     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
3333     c->nz = 0;
3334     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3335     Ccsr->values = new THRUSTARRAY(c->nz);
3336     goto finalizesym;
3337   }
3338 
3339   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
3340   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
3341   Acsr = (CsrMatrix*)Amat->mat;
3342   if (!biscompressed) {
3343     Bcsr = (CsrMatrix*)Bmat->mat;
3344 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3345     BmatSpDescr = Bmat->matDescr;
3346 #endif
3347   } else { /* we need to use row offsets for the full matrix */
3348     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
3349     Bcsr = new CsrMatrix;
3350     Bcsr->num_rows       = B->rmap->n;
3351     Bcsr->num_cols       = cBcsr->num_cols;
3352     Bcsr->num_entries    = cBcsr->num_entries;
3353     Bcsr->column_indices = cBcsr->column_indices;
3354     Bcsr->values         = cBcsr->values;
3355     if (!Bcusp->rowoffsets_gpu) {
3356       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3357       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3358       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
3359     }
3360     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3361     mmdata->Bcsr = Bcsr;
3362 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3363     if (Bcsr->num_rows && Bcsr->num_cols) {
3364       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
3365                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3366                                Bcsr->values->data().get(),
3367                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
3368                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
3369     }
3370     BmatSpDescr = mmdata->matSpBDescr;
3371 #endif
3372   }
3373   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
3374   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
3375   /* precompute flops count */
3376   if (ptype == MATPRODUCT_AB) {
3377     for (i=0, flops = 0; i<A->rmap->n; i++) {
3378       const PetscInt st = a->i[i];
3379       const PetscInt en = a->i[i+1];
3380       for (j=st; j<en; j++) {
3381         const PetscInt brow = a->j[j];
3382         flops += 2.*(b->i[brow+1] - b->i[brow]);
3383       }
3384     }
3385   } else if (ptype == MATPRODUCT_AtB) {
3386     for (i=0, flops = 0; i<A->rmap->n; i++) {
3387       const PetscInt anzi = a->i[i+1] - a->i[i];
3388       const PetscInt bnzi = b->i[i+1] - b->i[i];
3389       flops += (2.*anzi)*bnzi;
3390     }
3391   } else { /* TODO */
3392     flops = 0.;
3393   }
3394 
3395   mmdata->flops = flops;
3396   PetscCall(PetscLogGpuTimeBegin());
3397 
3398 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3399   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3400   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
3401                           NULL, NULL, NULL,
3402                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
3403                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
3404   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3405  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3406  {
3407   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3408      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3409   */
3410   void*  dBuffer1 = NULL;
3411   void*  dBuffer2 = NULL;
3412   void*  dBuffer3 = NULL;
3413   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3414   size_t bufferSize1 = 0;
3415   size_t bufferSize2 = 0;
3416   size_t bufferSize3 = 0;
3417   size_t bufferSize4 = 0;
3418   size_t bufferSize5 = 0;
3419 
3420   /*----------------------------------------------------------------------*/
3421   /* ask bufferSize1 bytes for external memory */
3422   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3423                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3424                                             &bufferSize1, NULL);PetscCallCUSPARSE(stat);
3425   PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
3426   /* inspect the matrices A and B to understand the memory requirement for the next step */
3427   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3428                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3429                                             &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);
3430 
3431   /*----------------------------------------------------------------------*/
3432   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3433                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3434                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
3435   PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
3436   PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
3437   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
3438   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3439                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3440                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
3441   PetscCallCUDA(cudaFree(dBuffer1));
3442   PetscCallCUDA(cudaFree(dBuffer2));
3443 
3444   /*----------------------------------------------------------------------*/
3445   /* get matrix C non-zero entries C_nnz1 */
3446   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3447   c->nz = (PetscInt) C_nnz1;
3448   /* allocate matrix C */
3449   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3450   Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3451   /* update matC with the new pointers */
3452   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
3453                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3454 
3455   /*----------------------------------------------------------------------*/
3456   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3457                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3458                                   &bufferSize5, NULL);PetscCallCUSPARSE(stat);
3459   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
3460   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3461                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3462                                   &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
3463   PetscCallCUDA(cudaFree(dBuffer3));
3464   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
3465                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3466                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3467                                      mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3468   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
3469  }
3470  #else
3471   size_t bufSize2;
3472   /* ask bufferSize bytes for external memory */
3473   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3474                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3475                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3476                                        mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
3477   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
3478   /* inspect the matrices A and B to understand the memory requirement for the next step */
3479   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3480                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3481                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3482                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
3483   /* ask bufferSize again bytes for external memory */
3484   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3485                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3486                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3487                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
3488   /* The CUSPARSE documentation is not clear, nor the API
3489      We need both buffers to perform the operations properly!
3490      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3491      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3492      is stored in the descriptor! What a messy API... */
3493   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
3494   /* compute the intermediate product of A * B */
3495   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3496                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3497                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3498                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
3499   /* get matrix C non-zero entries C_nnz1 */
3500   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3501   c->nz = (PetscInt) C_nnz1;
3502   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
3503   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3504   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3505   Ccsr->values = new THRUSTARRAY(c->nz);
3506   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3507   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
3508                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3509   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
3510                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3511                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3512  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3513 #else
3514   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3515   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
3516                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3517                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3518                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3519                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
3520   c->nz = cnz;
3521   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3522   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3523   Ccsr->values = new THRUSTARRAY(c->nz);
3524   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3525 
3526   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3527   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3528      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3529      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3530   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
3531                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3532                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3533                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3534                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
3535 #endif
3536   PetscCall(PetscLogGpuFlops(mmdata->flops));
3537   PetscCall(PetscLogGpuTimeEnd());
3538 finalizesym:
3539   c->singlemalloc = PETSC_FALSE;
3540   c->free_a       = PETSC_TRUE;
3541   c->free_ij      = PETSC_TRUE;
3542   PetscCall(PetscMalloc1(m+1,&c->i));
3543   PetscCall(PetscMalloc1(c->nz,&c->j));
3544   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3545     PetscInt *d_i = c->i;
3546     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3547     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3548     ii   = *Ccsr->row_offsets;
3549     jj   = *Ccsr->column_indices;
3550     if (ciscompressed) d_i = c->compressedrow.i;
3551     PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3552     PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3553   } else {
3554     PetscInt *d_i = c->i;
3555     if (ciscompressed) d_i = c->compressedrow.i;
3556     PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3557     PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3558   }
3559   if (ciscompressed) { /* need to expand host row offsets */
3560     PetscInt r = 0;
3561     c->i[0] = 0;
3562     for (k = 0; k < c->compressedrow.nrows; k++) {
3563       const PetscInt next = c->compressedrow.rindex[k];
3564       const PetscInt old = c->compressedrow.i[k];
3565       for (; r < next; r++) c->i[r+1] = old;
3566     }
3567     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
3568   }
3569   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
3570   PetscCall(PetscMalloc1(m,&c->ilen));
3571   PetscCall(PetscMalloc1(m,&c->imax));
3572   c->maxnz = c->nz;
3573   c->nonzerorowcnt = 0;
3574   c->rmax = 0;
3575   for (k = 0; k < m; k++) {
3576     const PetscInt nn = c->i[k+1] - c->i[k];
3577     c->ilen[k] = c->imax[k] = nn;
3578     c->nonzerorowcnt += (PetscInt)!!nn;
3579     c->rmax = PetscMax(c->rmax,nn);
3580   }
3581   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3582   PetscCall(PetscMalloc1(c->nz,&c->a));
3583   Ccsr->num_entries = c->nz;
3584 
3585   C->nonzerostate++;
3586   PetscCall(PetscLayoutSetUp(C->rmap));
3587   PetscCall(PetscLayoutSetUp(C->cmap));
3588   Ccusp->nonzerostate = C->nonzerostate;
3589   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
3590   C->preallocated  = PETSC_TRUE;
3591   C->assembled     = PETSC_FALSE;
3592   C->was_assembled = PETSC_FALSE;
3593   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3594     mmdata->reusesym = PETSC_TRUE;
3595     C->offloadmask   = PETSC_OFFLOAD_GPU;
3596   }
3597   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3598   PetscFunctionReturn(0);
3599 }
3600 
3601 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3602 
3603 /* handles sparse or dense B */
/* Chooses the symbolic implementation for mat = product(A,B[,C]) based on the operand
   types: a CUDA-dense B goes to the SeqAIJCUSPARSE*SeqDENSECUDA kernels, all-CUSPARSE
   sparse operands go to the SpGEMM-based kernels, and anything else (including a user
   request via the *_backend_cpu options below) falls back to the CPU SeqAIJ path. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* classify the operands: is B dense? are B (and, for ABC products, C) usable on the GPU? */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* The option name depends on how the user entered: the old API (MatMatMult() etc.)
       uses per-operation option names, the MatProduct API uses a single generic name. */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no native CUSPARSE triple products: compose them from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
3724 
/* yy = A*xx (no transpose, nothing added); thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3731 
/* zz = A*xx + yy; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3738 
/* yy = A^H*xx (conjugate transpose); thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3745 
/* zz = A^H*xx + yy (conjugate transpose); thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3752 
/* yy = A^T*xx; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3759 
/* y[idx[i]] += x[i] for i in [0,n): scatter-add a compressed work vector into the full
   output vector. Launched with a 1-D grid covering n (see MatMultAddKernel_SeqAIJCUSPARSE).
   No atomics are used, so this relies on the entries of idx being distinct, as they are
   for the compressed-row index list the caller passes. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* compute the flat index in PetscInt, not int: with 64-bit indices and a grid large
     enough to cover n > 2^31 the 32-bit product blockIdx.x*blockDim.x would overflow */
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3765 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared kernel behind all MatMult{,Add}{,Transpose,HermitianTranspose}_SeqAIJCUSPARSE
   wrappers. yy may be NULL (nothing added) and may alias zz (z = op(A) x + z). When the
   matrix stores compressed rows (all-zero rows dropped), the short product lives in
   cusparsestruct->workVector and is scattered to / gathered from the full-length vectors. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: the product is just 0 or a copy of yy */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* select the matstruct to multiply with: either the matrix itself (possibly with a
     cuSPARSE transpose op) or the explicitly stored transpose, built on demand */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache of cuSPARSE generic-API descriptors */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3954 
/* zz = A^T*xx + yy; thin wrapper over the shared SpMV kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3961 
/* Finish assembly through the host SeqAIJ path, then free the cached raw device
   matrix if the assembly changed the nonzero state (the cached copy is stale). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state_before = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (cusp->deviceMat && A->nonzerostate != state_before) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3977 
3978 /* --------------------------------------------------------------------------------*/
3979 /*@
3980    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default sequential PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3983    assembly performance the user should preallocate the matrix storage by setting
3984    the parameter nz (or the array nnz).  By setting these parameters accurately,
3985    performance during matrix assembly can be increased by more than a factor of 50.
3986 
3987    Collective
3988 
3989    Input Parameters:
3990 +  comm - MPI communicator, set to PETSC_COMM_SELF
3991 .  m - number of rows
3992 .  n - number of columns
3993 .  nz - number of nonzeros per row (same for all rows)
3994 -  nnz - array containing the number of nonzeros in the various rows
3995          (possibly different for each row) or NULL
3996 
3997    Output Parameter:
3998 .  A - the matrix
3999 
4000    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4002    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
4003 
4004    Notes:
4005    If nnz is given then nz is ignored
4006 
4007    The AIJ format (also called the Yale sparse matrix format or
4008    compressed row storage), is fully compatible with standard Fortran 77
4009    storage.  That is, the stored row and column indices can begin at
4010    either one (as in Fortran) or zero.  See the users' manual for details.
4011 
4012    Specify the preallocated storage with either nz or nnz (not both).
4013    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
4014    allocation.  For large problems you MUST preallocate memory or you
4015    will get TERRIBLE performance, see the users' manual chapter on matrices.
4016 
4017    By default, this format uses inodes (identical nodes) when possible, to
4018    improve numerical efficiency of matrix-vector products and solves. We
4019    search for consecutive rows with the same nonzero structure, thereby
4020    reusing matrix information to achieve increased efficiency.
4021 
4022    Level: intermediate
4023 
4024 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
4025 @*/
/* See the manual page above: create, size, type and preallocate in one call */
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n)); /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
4035 
/* Destroy the GPU-side data (factor or plain matrix, depending on factortype),
   clear all composed function slots registered for this type, and finally run
   the base SeqAIJ destructor for the host-side data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  }
  /* composing NULL removes each previously composed method */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
4057 
4058 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
4059 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the host SeqAIJ path, then convert the copy in place back to SEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
4067 
/* Y = Y + a*X on the GPU. Strategy, in order of preference:
   - SAME_NONZERO_PATTERN (possibly detected below by comparing the CSR index arrays):
     a single cuBLAS axpy on the value arrays,
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam writing back into Y's arrays,
   - otherwise: fall back to the CPU SeqAIJ implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* different method tables: X is not handled by this implementation, use the CPU path */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* compute a*X + 1*Y with csrgeam, writing the result in place over Y's arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* alpha/beta live on the host here, so switch the pointer mode for the geam calls */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* general pattern: delegate to the CPU implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
4161 
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *varray;
  cublasHandle_t handle;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  /* Scale all stored nonzeros of Y in place on the device: Y = a*Y, via one cuBLAS scal call */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&varray));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz,&n)); /* nz may exceed PetscBLASInt range; cast checks */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle,n,&a,varray,stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&varray));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* cached diagonal info is stale after scaling */
  PetscFunctionReturn(0);
}
4181 
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool  zeroed_on_gpu = PETSC_FALSE;
  Mat_SeqAIJ *aij          = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* For unfactored matrices, zero the device-side CSR values (and those of any cached transpose) */
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        zeroed_on_gpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* The host copy is always zeroed */
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* If both copies were zeroed they agree; otherwise only the CPU copy is valid */
  A->offloadmask = zeroed_on_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
4210 
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* Factored matrices have no separate CPU/GPU operation tables to swap; just record the flag */
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (!flg) {
    /* Bind to GPU: install the CUSPARSE implementations and compose GPU-aware methods */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* Bind to CPU: bring values back from the device, restore plain SeqAIJ implementations,
       and remove the GPU-only composed methods */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); /* wipe the entire inner-op table; defaults apply */
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  }
  A->boundtocpu = flg;
  /* inode optimization is only usable from the CPU code paths */
  a->inode.use = (flg && a->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}
4274 
/* Convert a SeqAIJ matrix into a SeqAIJCUSPARSE matrix.
   Allocates the CUSPARSE side data (handle, stream, algorithm defaults), switches the
   default vector type to CUDA, installs the CUSPARSE method table and composed functions.
   Note: mtype is unused here; the type name is always set to MATSEQAIJCUSPARSE.
   For MAT_INPLACE_MATRIX neither copy branch runs, so B aliases the input A. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  /* Allocate the GPU-side struct only once; factored matrices get the (smaller) trifactors struct */
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* nothing has been copied to the device yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); /* installs the remaining GPU operations */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
4334 
/* Type constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix and
   convert it in place to the CUSPARSE type */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
4342 
4343 /*MC
4344    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4345 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
4347    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
4348    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
4349 
4350    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
4355 
4356   Level: beginner
4357 
4358 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4359 M*/
4360 
4361 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
4362 
/* Register the CUSPARSE solver packages with PETSc's solver-type registry:
   the banded LU solver (for plain SeqAIJ input) and the general CUSPARSE
   LU/Cholesky/ILU/ICC factorizations for SeqAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
4374 
/* Release all COO-assembly state held on the GPU side of the matrix.
   Safe to call when no COO preallocation was ever made (cusp may be NULL,
   and deleting/freeing NULL pointers is a no-op). */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d again
       whenever they are non-NULL, so leaving them dangling would cause a double free */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4392 
/* Tear down the whole Mat_SeqAIJCUSPARSE struct: both mult structs, all thrust
   vectors, the cusparse handle, the COO device maps, and finally the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    delete cusp->csr2csc_i;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    if (cusp->jmap_d) PetscCallCUDA(cudaFree(cusp->jmap_d));
    if (cusp->perm_d) PetscCallCUDA(cudaFree(cusp->perm_d));
    PetscCall(PetscFree(*cusparsestruct)); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(0);
}
4411 
/* Free a CsrMatrix (values plus index arrays) and NULL the caller's pointer */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
4424 
/* Overload for triangular-factor structs: destroy the cusparse descriptors, the
   CSR storage, and all device/host scratch buffers, then free the struct itself.
   All members are checked for NULL, so a partially built struct is handled. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    PetscCall(PetscFree(*trifactor)); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(0);
}
4441 
/* Overload for mult structs: destroy the stored matrix (CSR, or HYB pre-CUDA-11),
   the cusparse descriptors, the device-resident scalar constants, and (CUDA >= 11)
   the per-operation SpMV descriptors/buffers, then delete the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was unchecked; propagate errors like every other call here */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used with device pointer mode */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) { /* one cached SpMV setup per operation variant */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
4483 
/* Release everything inside a Mat_SeqAIJCUSPARSETriFactors without freeing the
   struct itself or its cusparse handle, so it can be refilled by a new factorization.
   NOTE(review): on the CUSPARSE >= 11500 path the freed pointers/descriptors are not
   reset to NULL here — callers are presumed to either rebuild them or destroy the
   struct next; confirm before calling Reset twice in a row. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* legacy (csrsv2-based) triangular factor structs, including transposes */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector = NULL;
    /* band-solver device arrays */
    if (fs->a_band_d)   PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d)   PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
   #if CUSPARSE_VERSION >= 11500
    /* SpSV-based solve path: factor storage, work vectors, and analysis buffers */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
   #endif
  }
  PetscFunctionReturn(0);
}
4532 
/* Fully destroy a trifactors struct: reset its contents, then release the
   cusparse handle and the struct itself */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    /* grab the handle before Reset, then destroy it last */
    cusparseHandle_t handle = (*trifactors)->handle;

    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
4547 
/* Strict-weak ordering of (i,j) pairs: by row first, then by column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
4558 
/* Equality of (i,j) pairs: both coordinates must match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
4568 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
4577 
/* Logical OR of two flags, returned as a PetscInt (0 or 1) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
4586 
4587 #include <thrust/iterator/discard_iterator.h>
4588 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
4589 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
4590 {
4591   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4592   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
4593   THRUSTARRAY                           *cooPerm_v = NULL;
4594   thrust::device_ptr<const PetscScalar> d_v;
4595   CsrMatrix                             *matrix;
4596   PetscInt                              n;
4597 
4598   PetscFunctionBegin;
4599   PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
4600   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
4601   if (!cusp->cooPerm) {
4602     PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
4603     PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
4604     PetscFunctionReturn(0);
4605   }
4606   matrix = (CsrMatrix*)cusp->mat->mat;
4607   PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4608   if (!v) {
4609     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
4610     goto finalize;
4611   }
4612   n = cusp->cooPerm->size();
4613   if (isCudaMem(v)) {
4614     d_v = thrust::device_pointer_cast(v);
4615   } else {
4616     cooPerm_v = new THRUSTARRAY(n);
4617     cooPerm_v->assign(v,v+n);
4618     d_v = cooPerm_v->data();
4619     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
4620   }
4621   PetscCall(PetscLogGpuTimeBegin());
4622   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
4623     if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */
4624       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
4625       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
4626       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4627         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
4628         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
4629       */
4630       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
4631       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
4632       delete cooPerm_w;
4633     } else {
4634       /* all nonzeros in d_v[] are unique entries */
4635       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
4636                                                                 matrix->values->begin()));
4637       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
4638                                                                 matrix->values->end()));
4639       thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
4640     }
4641   } else {
4642     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
4643       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
4644       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
4645     } else {
4646       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
4647                                                                 matrix->values->begin()));
4648       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
4649                                                                 matrix->values->end()));
4650       thrust::for_each(zibit,zieit,VecCUDAEquals());
4651     }
4652   }
4653   PetscCall(PetscLogGpuTimeEnd());
4654 finalize:
4655   delete cooPerm_v;
4656   A->offloadmask = PETSC_OFFLOAD_GPU;
4657   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4658   /* shorter version of MatAssemblyEnd_SeqAIJ */
4659   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
4660   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
4661   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
4662   a->reallocs         = 0;
4663   A->info.mallocs    += 0;
4664   A->info.nz_unneeded = 0;
4665   A->assembled = A->was_assembled = PETSC_TRUE;
4666   A->num_ass++;
4667   PetscFunctionReturn(0);
4668 }
4669 
/* Mark the cached transpose of A as out of date; if destroy is true also free
   the transpose mult struct and the csr2csc index map */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!spptr) PetscFunctionReturn(0); /* GPU data never allocated: nothing to invalidate */
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose,spptr->format));
    delete spptr->csr2csc_i;
    spptr->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4685 
4686 #include <thrust/binary_search.h>
4687 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A on the GPU from n COO (coo_i, coo_j) pairs that
   contain no negative indices. Produces:
     - cusp->cooPerm:   permutation from sorted order back into the user's v[] order
     - cusp->cooPerm_a: (only when duplicates exist) map from each input entry to its
                        unique-nonzero index, used later to reduce repeated entries
   The host-side SeqAIJ arrays (a->i, a->j, a->a) are rebuilt to match. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;  /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* previous COO state has a different length: discard it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* Rebuild the host CSR arrays from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); /* empty matrix */
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
4799 
/* Preallocate a MATSEQAIJCUSPARSE matrix from COO (i,j) index arrays.

   Two code paths exist: a fast "basic" one, used when the indices live on the
   device or contain no negative entries, and an extended one built on the host
   SeqAIJ COO implementation, which supports negative indices; the extended
   path mirrors the host-side jmap/perm arrays onto the device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscBool    basic   = PETSC_TRUE;
  PetscMemType memtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* discard COO data left over from any previous preallocation */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&memtype));
    if (PetscMemTypeHost(memtype)) { /* only host-resident indices can be scanned for negatives */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (!basic) { /* extended path: negative indices present */
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    /* build jmap/perm on the host via the SeqAIJ implementation, then copy them to the device */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ*>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  } else { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  }
  PetscFunctionReturn(0);
}
4835 
/* Kernel: accumulate COO values kv[] into the matrix value array a[].

   For CSR nonzero i, jmap[i]..jmap[i+1] delimits (through perm[]) the COO
   entries that land on it; their kv[] values are summed. With INSERT_VALUES
   the sum replaces a[i], otherwise it is added to a[i]. A grid-stride loop is
   used so any launch configuration covers all nnz entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* widen to PetscCount BEFORE multiplying: blockIdx.x*blockDim.x and
     gridDim.x*blockDim.x are otherwise evaluated in 32-bit unsigned
     arithmetic and can wrap for very large grids/nnz counts */
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x*blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4846 
/* Set values of a MATSEQAIJCUSPARSE matrix previously preallocated in COO format.
   With the extended COO path the accumulation runs entirely on the device;
   host-resident v[] is staged into a temporary device buffer first. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  } else {
    const PetscCount  nnz = seq->nz;
    const PetscScalar *v_d = v; /* device-accessible view of v[] */
    PetscScalar       *Aa;
    PetscMemType      memtype;

    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* v[] lives on the host: stage a device copy */
      PetscCallCUDA(cudaMalloc((void**)&v_d,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v_d,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* write-only access avoids a host-to-device copy when values are fully overwritten */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else                        PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (nnz) {
      MatAddCOOValues<<<(nnz+255)/256,256>>>(v_d,nnz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else                        PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v_d));
  }
  PetscFunctionReturn(0);
}
4881 
4882 /*@C
4883     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4884 
4885    Not collective
4886 
    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should be returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4894 
4895     Level: developer
4896 
4897     Notes:
4898       When compressed is true, the CSR structure does not contain empty rows
4899 
4900 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4901 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* both pointers required; nothing to do otherwise */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!compressed && a->compressedrow.use) {
    /* device CSR is stored compressed (empty rows removed): build/reuse full row offsets */
    if (!cusp->rowoffsets_gpu) {
      cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else {
    *i = csr->row_offsets->data().get();
  }
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4929 
4930 /*@C
4931     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4932 
4933    Not collective
4934 
    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should be returned in compressed form

    Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4942 
4943     Level: developer
4944 
4945 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4946 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the pointers were borrowed from the device CSR structure; just invalidate
     the caller's copies — no data movement or state change is needed */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4956 
4957 /*@C
4958    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4959 
4960    Not Collective
4961 
4962    Input Parameter:
4963 .   A - a MATSEQAIJCUSPARSE matrix
4964 
4965    Output Parameter:
4966 .   a - pointer to the device data
4967 
4968    Level: developer
4969 
4970    Notes: may trigger host-device copies if up-to-date matrix data is on host
4971 
4972 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4973 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy */
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* read-only access: offload mask and transpose data are left untouched */
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4991 
4992 /*@C
4993    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4994 
4995    Not Collective
4996 
4997    Input Parameter:
4998 .   A - a MATSEQAIJCUSPARSE matrix
4999 
5000    Output Parameter:
5001 .   a - pointer to the device data
5002 
5003    Level: developer
5004 
5005 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
5006 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no diagonal invalidation or object-state bump needed;
     just invalidate the caller's borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
5016 
5017 /*@C
5018    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
5019 
5020    Not Collective
5021 
5022    Input Parameter:
5023 .   A - a MATSEQAIJCUSPARSE matrix
5024 
5025    Output Parameter:
5026 .   a - pointer to the device data
5027 
5028    Level: developer
5029 
5030    Notes: may trigger host-device copies if up-to-date matrix data is on host
5031 
5032 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
5033 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device values are current before handing them out */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may modify the values: the device copy becomes authoritative
     and any cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
5053 /*@C
5054    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
5055 
5056    Not Collective
5057 
5058    Input Parameter:
5059 .   A - a MATSEQAIJCUSPARSE matrix
5060 
5061    Output Parameter:
5062 .   a - pointer to the device data
5063 
5064    Level: developer
5065 
5066 .seealso: `MatSeqAIJCUSPARSEGetArray()`
5067 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values may have been modified through the borrowed pointer: drop any
     cached diagonal information and bump the object state so dependents
     (e.g. preconditioners) know to rebuild */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
5079 
5080 /*@C
5081    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
5082 
5083    Not Collective
5084 
5085    Input Parameter:
5086 .   A - a MATSEQAIJCUSPARSE matrix
5087 
5088    Output Parameter:
5089 .   a - pointer to the device data
5090 
5091    Level: developer
5092 
5093    Notes: does not trigger host-device copies and flags data validity on the GPU
5094 
5095 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
5096 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is performed */
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the device copy becomes authoritative and cached transpose values are stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
5115 
5116 /*@C
5117    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
5118 
5119    Not Collective
5120 
5121    Input Parameter:
5122 .   A - a MATSEQAIJCUSPARSE matrix
5123 
5124    Output Parameter:
5125 .   a - pointer to the device data
5126 
5127    Level: developer
5128 
5129 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
5130 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* values were (re)written through the borrowed pointer: invalidate the cached
     diagonal and bump the object state so dependent objects rebuild */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
5142 
/* Strict weak ordering for (row, col, value, flag) tuples: lexicographic
   comparison on the first two components; value and flag are ignored. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
5153 
/* Unary functor returning its integer argument offset by a fixed amount */
struct Shift
{
  int _delta; /* amount added to every input */

  Shift(int shift) : _delta(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _delta;
  }
};
5165 
5166 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C = [A B] from scratch: set up its cuSPARSE structs and build its device CSR */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE; /* C is stored with full (uncompressed) rows */
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    /* device-resident scalar constants used by cuSPARSE SpMV calls */
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm maps input order ([A entries; B entries]) to positions in merged C,
       and is reused by the MAT_REUSE_MATRIX path below to scatter new values */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand A and B from CSR to COO row indices so their entries can be merged by (row,col) */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* per-stream markers: 1 tags entries coming from A, 0 tags entries from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* B's column indices are shifted by A->cmap->n on the fly (C = [A B]) */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two (row,col,val,flag) streams sorted by (row,col); flags land in wPerm */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split merged positions by origin flag: cooPerm[0..Annz) holds the positions in C
         of A's entries (flag 1), cooPerm[Annz..) those of B's entries (flag 0) */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* convert the merged COO row indices back to CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        /* C' = [A'; B'] stacks the transposes: concatenate row offsets (B's shifted
           by a->nz, overwriting the duplicated boundary entry), columns, and values */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the CSR structure (not the values) on the host so the SeqAIJ metadata is valid */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* fill per-row length metadata from the row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: sparsity pattern is unchanged; scatter the new values of A
       and B into C's device CSR through the cooPerm map built above */
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* C->values[cooPerm[k]] = A->values[k] for the first Annz entries of cooPerm */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* ...and B's values through the remaining entries */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are simply [A' values; B' values] concatenated */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* values live on the device only at this point */
  PetscFunctionReturn(0);
}
5467 
/*
   MatSeqAIJCopySubArray_SeqAIJCUSPARSE - gather selected entries of the matrix's
   device value array into an output buffer.

   Input Parameters:
+  A   - the SEQAIJCUSPARSE matrix; its device value array is accessed read-only
.  n   - number of entries to copy
-  idx - indices into the aij value array of the entries to gather, or NULL to
         copy the first n entries contiguously

   Output Parameter:
.  v   - destination array of length n; may point to either host or device
         memory (detected at runtime with isCudaMem())

   Notes:
   When idx is provided, the gather runs on the GPU via a thrust permutation
   iterator. If v is host memory, the gathered values are staged in a temporary
   device buffer and then copied back with cudaMemcpy.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: stage the gathered values in a temporary device buffer */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[widx[i]] for i in [0,n), performed on the GPU */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    /* no index set: copy the first n values contiguously, direction chosen by where v lives */
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the data just moved device->host, so log GpuToCpu
     (previously mislogged as CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
5505