xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision e08b1d6d0faae6eca507e20c9d3498f81719d047)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Name tables consumed by PetscOptionsEnum(): the entries before the last three are the
   option values in 0-based enum-value order, followed by the enum type name, the option
   prefix, and a null terminator, as the PetscOptionsEnum() convention requires. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat,PetscOptionItems *PetscOptionsObject);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,PetscInt[],PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/*
  Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE:
  records the requested GPU storage format in the matrix' cuSPARSE data structure.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* a sequential matrix holds a single storage format, so MULT and ALL are handled identically */
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
112 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the matrix type's implementation, if it registered one; a no-op otherwise */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
137 
/*
  Type-specific implementation of MatCUSPARSESetUseCPUSolve(): records whether the
  triangular solves of this matrix should be performed on the CPU instead of the GPU.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
146 
/*@
   MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the matrix type's implementation, if it registered one; a no-op otherwise */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
172 
/*
  MatSetOption implementation for SEQAIJCUSPARSE: handles the options that need
  GPU-specific action and forwards everything else to the SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* drop any cached transpose now, so a stale copy cannot be used if the flag is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
188 
189 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
190 
/*
  Numeric LU factorization for SEQAIJCUSPARSE.  The factorization itself is done
  on the CPU by the SeqAIJ kernel; this routine then selects the MatSolve
  implementation to use and, unless a CPU solve was requested, copies the
  triangular factors to the GPU and runs the cuSPARSE solve analysis.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  /* make sure the host holds current values of A before factoring on the CPU */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      /* natural ordering: the solve can skip the row/column permutation steps */
      B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no GPU MatMatSolve variants are provided; fall back to the defaults */
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}
224 
/*
  Processes the -mat_cusparse_* options for a SEQAIJCUSPARSE matrix: GPU storage
  format, CPU-vs-GPU triangular solve, and (CUDA >= 11) the cuSPARSE SpMV, SpMM,
  and csr2csc algorithm selections.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A,PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) { /* these options apply only to unfactored matrices */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
264 
/*
  Builds (on first call) or refreshes (on refactorization) the GPU copy of the
  unit-diagonal lower triangular factor L of an (I)LU-factored SEQAIJ matrix and
  runs the cuSPARSE triangular-solve analysis on it.  The strictly lower part of
  each row is extracted from the SeqAIJ factor storage and a 1.0 diagonal entry
  is appended; on later calls only the numerical values are re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): this assumes row 0 of L holds only the unit diagonal, hence the ai[1] offset */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the device uploads below run at full bandwidth */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 requires a work buffer whose size is queried first */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for fast re-uploads; the index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        /* the sparsity pattern is unchanged: only the values need to go back to the GPU */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
408 
/*
  Builds (on first call) or refreshes (on refactorization) the GPU copy of the
  upper triangular factor U of an (I)LU-factored SEQAIJ matrix and runs the
  cuSPARSE triangular-solve analysis on it.  The SeqAIJ factor stores the upper
  rows via the adiag[] pointers with the diagonal held in reciprocal form; rows
  are walked backwards and each diagonal entry is re-inverted for the non-unit
  diagonal solve.  On later calls only the numerical values are re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the device uploads below run at full bandwidth */
        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          /* v[nz] is the stored reciprocal of the diagonal; invert it back for the non-unit-diagonal solve */
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 requires a work buffer whose size is queried first */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for fast re-uploads; the index buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        /* the sparsity pattern is unchanged: only the values need to go back to the GPU */
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
551 
/*
  Copies the (I)LU triangular factors of A to the GPU, building the CSR
  structures and running the cuSPARSE solve analysis on first use, and caches
  the row/column permutations of the factorization as GPU index arrays when the
  orderings are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector shared by the lower/upper triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
594 
/*
  Builds (first call) or refreshes (subsequent numeric factorizations) the GPU copies of the
  ICC triangular factors of A.

  A is read through both its AIJ and SBAIJ views: only the upper triangle is stored and,
  as the code below relies on, the diagonal entry of each row is stored LAST (v[nz]).
  Two factor structures are produced from the same upper-triangular CSR pattern:
    - upTriFactorPtr: solved with CUSPARSE_OPERATION_NON_TRANSPOSE (values AAUp),
    - loTriFactorPtr: the same pattern solved with CUSPARSE_OPERATION_TRANSPOSE
      (values AALo, additionally scaled by the diagonal).
  On repeat calls only the numerical values are rebuilt and uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data; /* SBAIJ view of the same data: i/j/a hold the upper triangle */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { /* host data is newer than (or absent from) the GPU */
    try {
      /* pinned host staging buffers for the factor values (fast async-capable upload) */
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: v[nz] is the diagonal, stored last in the row;
             both factors store its reciprocal in the diagonal slot */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* copy the nz off-diagonal entries, then negate for the upper factor and
               scale by the diagonal for the lower factor */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* note: fill mode stays UPPER because the lower factor is applied as the TRANSPOSE
           of this upper-triangular storage (see solveOp below) */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* same sparsity pattern (AiUp/AjUp) as the upper factor, different values (AALo) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        /* AiUp/AjUp were only needed to populate the device arrays */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Factor structure already lives on the GPU: recompute only the numerical values
           (same loop as above minus the index bookkeeping) and upload them */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      /* thrust/cusparse helpers in this file throw char* messages; convert to a PETSc error */
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
811 
/*
  Pushes the ICC factor of A to the GPU and records the row/column permutation index
  arrays needed by the permuted MatSolve when the ordering is not the identity.

  Input Parameter:
. A - the ICC-factored SEQAIJCUSPARSE matrix (A->spptr holds the triangular factors)

  Note: this routine may run once per numeric factorization.  The permutation arrays are
  therefore allocated only on first use and refreshed in place afterwards; the previous
  code re-allocated them every call, leaking the old device arrays (compare the guarded
  allocation in the LU analysis path).
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=(a->nz-n)*2 + n; /* off-diagonal entries counted for both factors, diagonal once */

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    /* allocate on first use only; unconditional `new` here would leak the arrays
       from a previous numeric factorization */
    if (!cusparseTriFactors->rpermIndices) cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
848 
/*
  Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization itself runs on the
  CPU (via the SeqAIJ kernel), after which the triangular factors are analyzed and copied
  to the GPU and the appropriate GPU solve routines are installed on B.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;

  PetscFunctionBegin;
  /* make sure the host has current values, then factor on the CPU */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the solve kernels: the natural-ordering variants skip the permutation step */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU matsolve implementations in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
877 
/*
  Builds the transposes of the lower and upper triangular factors (via cusparse csr2csc)
  and runs the csrsv solve analysis on them, so transpose solves can be performed as
  ordinary (non-transpose) triangular solves.  Results are stored in
  loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr; requires
  loTriFactorPtr/upTriFactorPtr to be populated already.

  Fixes: the MAT_CUSPARSEGenerateTranspose event was "begun" twice and never ended for
  both factors (PetscLogEventBegin where PetscLogEventEnd was required), leaving the
  profiling event unbalanced.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode flips
     because the transpose of an upper-triangular matrix is lower-triangular (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* the transposed factor is solved non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 requires an explicit user-provided work buffer for csr2csc */
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
                                  #else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was erroneously a second EventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                         loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                         loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                         loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                         &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                   loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                   loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                   loTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   loTriFactorT->solveInfo,
                                   loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
                                   #else
                                   loTriFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor (fill mode flipped, as above) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* the transposed factor is solved non-transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                               upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                               upTriFactor->csrMat->values->data().get(),
                                               upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(),
                                               upTriFactorT->csrMat->values->data().get(),
                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                  upTriFactor->csrMat->values->data().get(),
                                  upTriFactor->csrMat->row_offsets->data().get(),
                                  upTriFactor->csrMat->column_indices->data().get(),
                                  upTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
                                  #else
                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was erroneously a second EventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                         upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                         upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                         upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                         &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                   upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                   upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                   upTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   upTriFactorT->solveInfo,
                                   upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
                                   #else
                                   upTriFactorT->solveInfo));
                                   #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1081 
/* Unary functor: casts the real part of a PetscScalar to a PetscInt (host or device). */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar v) { return (PetscInt)PetscRealPart(v); }
};
1090 
1091 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1092 {
1093   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1094   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1095   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1096   cusparseStatus_t             stat;
1097   cusparseIndexBase_t          indexBase;
1098 
1099   PetscFunctionBegin;
1100   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1101   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1102   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1103   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1104   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1105   if (A->transupdated) PetscFunctionReturn(0);
1106   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1107   PetscCall(PetscLogGpuTimeBegin());
1108   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1109     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1110   }
1111   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1112     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1113     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1114     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1115     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1116     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1117 
1118     /* set alpha and beta */
1119     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
1120     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
1121     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1122     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1123     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
1124     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1125 
1126     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1127       CsrMatrix *matrixT = new CsrMatrix;
1128       matstructT->mat = matrixT;
1129       matrixT->num_rows = A->cmap->n;
1130       matrixT->num_cols = A->rmap->n;
1131       matrixT->num_entries = a->nz;
1132       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1133       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1134       matrixT->values = new THRUSTARRAY(a->nz);
1135 
1136       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1137       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1138 
1139      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1140       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1141         stat = cusparseCreateCsr(&matstructT->matDescr,
1142                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1143                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1144                                matrixT->values->data().get(),
1145                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1146                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1147       #else
1148         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1149            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1150 
1151            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1152            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1153            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1154         */
1155         if (matrixT->num_entries) {
1156           stat = cusparseCreateCsr(&matstructT->matDescr,
1157                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1158                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1159                                  matrixT->values->data().get(),
1160                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1161                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1162 
1163         } else {
1164           matstructT->matDescr = NULL;
1165           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1166         }
1167       #endif
1168      #endif
1169     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1170    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1171       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1172    #else
1173       CsrMatrix *temp  = new CsrMatrix;
1174       CsrMatrix *tempT = new CsrMatrix;
1175       /* First convert HYB to CSR */
1176       temp->num_rows = A->rmap->n;
1177       temp->num_cols = A->cmap->n;
1178       temp->num_entries = a->nz;
1179       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1180       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1181       temp->values = new THRUSTARRAY(a->nz);
1182 
1183       stat = cusparse_hyb2csr(cusparsestruct->handle,
1184                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1185                               temp->values->data().get(),
1186                               temp->row_offsets->data().get(),
1187                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1188 
1189       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1190       tempT->num_rows = A->rmap->n;
1191       tempT->num_cols = A->cmap->n;
1192       tempT->num_entries = a->nz;
1193       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1194       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1195       tempT->values = new THRUSTARRAY(a->nz);
1196 
1197       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1198                               temp->num_cols, temp->num_entries,
1199                               temp->values->data().get(),
1200                               temp->row_offsets->data().get(),
1201                               temp->column_indices->data().get(),
1202                               tempT->values->data().get(),
1203                               tempT->column_indices->data().get(),
1204                               tempT->row_offsets->data().get(),
1205                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1206 
1207       /* Last, convert CSC to HYB */
1208       cusparseHybMat_t hybMat;
1209       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1210       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1211         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1212       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1213                               matstructT->descr, tempT->values->data().get(),
1214                               tempT->row_offsets->data().get(),
1215                               tempT->column_indices->data().get(),
1216                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1217 
1218       /* assign the pointer */
1219       matstructT->mat = hybMat;
1220       A->transupdated = PETSC_TRUE;
1221       /* delete temporaries */
1222       if (tempT) {
1223         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1224         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1225         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1226         delete (CsrMatrix*) tempT;
1227       }
1228       if (temp) {
1229         if (temp->values) delete (THRUSTARRAY*) temp->values;
1230         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1231         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1232         delete (CsrMatrix*) temp;
1233       }
1234      #endif
1235     }
1236   }
1237   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1238     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1239     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1240     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1241     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1242     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1243     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1244     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1245     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1246     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1247     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1248     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1249       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1250       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1251       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1252     }
1253     if (!cusparsestruct->csr2csc_i) {
1254       THRUSTARRAY csr2csc_a(matrix->num_entries);
1255       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1256 
1257       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1258      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259       void   *csr2cscBuffer;
1260       size_t csr2cscBufferSize;
1261       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1262                                            A->cmap->n, matrix->num_entries,
1263                                            matrix->values->data().get(),
1264                                            cusparsestruct->rowoffsets_gpu->data().get(),
1265                                            matrix->column_indices->data().get(),
1266                                            matrixT->values->data().get(),
1267                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1268                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1269                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
1270       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1271      #endif
1272 
1273       if (matrix->num_entries) {
1274         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1275            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1276            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1277 
1278            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1279            should be filled with indexBase. So I just take a shortcut here.
1280         */
1281         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1282                               A->cmap->n,matrix->num_entries,
1283                               csr2csc_a.data().get(),
1284                               cusparsestruct->rowoffsets_gpu->data().get(),
1285                               matrix->column_indices->data().get(),
1286                               matrixT->values->data().get(),
1287                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1288                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1289                               CUSPARSE_ACTION_NUMERIC,indexBase,
1290                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1291                              #else
1292                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1293                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1294                              #endif
1295       } else {
1296         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1297       }
1298 
1299       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1300       PetscCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1301      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1302       PetscCallCUDA(cudaFree(csr2cscBuffer));
1303      #endif
1304     }
1305     PetscCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1306                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1307                                                      matrixT->values->begin()));
1308   }
1309   PetscCall(PetscLogGpuTimeEnd());
1310   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1311   /* the compressed row indices is not used for matTranspose */
1312   matstructT->cprowIndices = NULL;
1313   /* assign the pointer */
1314   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1315   A->transupdated = PETSC_TRUE;
1316   PetscFunctionReturn(0);
1317 }
1318 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solves A^T x = b on the GPU using the transposed triangular factors and the
   row/column permutations stored in the tri-factors structure. Data flow:
   b is gathered through the row permutation into x, the transposed upper factor
   is solved (x -> work vector), then the transposed lower factor (work -> x),
   and finally x is gathered through the column permutation (via the work vector,
   since the gather cannot be done in place).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; the result is cached in cusparseTriFactors for later solves */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: gather b[rperm[i]] into x, which becomes the rhs of the first solve */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: rhs is the permuted b sitting in xarray, result lands in tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: rhs is tempGPU, result lands back in xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); /* two triangular solves: ~2*nnz flops minus the diagonal terms */
  PetscFunctionReturn(0);
}
1406 
/*
   Solves A^T x = b on the GPU for a factorization done in natural ordering,
   i.e. no row/column permutations are needed. The transposed upper factor is
   solved first (b -> work vector), then the transposed lower factor
   (work -> x).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; the result is cached in cusparseTriFactors for later solves */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the transposed upper factor: rhs is barray, result lands in tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: rhs is tempGPU, result lands in xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); /* two triangular solves: ~2*nnz flops minus the diagonal terms */
  PetscFunctionReturn(0);
}
1475 
/*
   Solves A x = b on the GPU using the cached triangular factors and the
   row/column permutations stored in the tri-factors structure. Data flow:
   b is gathered through the row permutation into the work vector, the lower
   factor is solved (work -> x), the upper factor is solved (x -> work), and
   the result is gathered through the column permutation back into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: gather b[rperm[i]] into tempGPU, the rhs of the lower solve */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: rhs is tempGPU, result lands in xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve U: rhs is xarray, result lands back in tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation: gather tempGPU[cperm[i]] into the output x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); /* two triangular solves: ~2*nnz flops minus the diagonal terms */
  PetscFunctionReturn(0);
}
1550 
/*
   Solves A x = b on the GPU for a factorization done in natural ordering,
   i.e. no row/column permutations are needed. The lower factor is solved
   first (b -> work vector), then the upper factor (work -> x).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: rhs is barray, result lands in tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Next, solve U: rhs is tempGPU, result lands in xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); /* two triangular solves: ~2*nnz flops minus the diagonal terms */
  PetscFunctionReturn(0);
}
1611 
1612 #if CUSPARSE_VERSION >= 11500
1613 /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/*
   Solves fact x = b with the ILU(0) factors via two cusparseSpSV triangular
   sweeps: first L y = b (y in the cached work array fs->Y), then U x = y.
   The dense-vector descriptors X/Y are rebound to the current arrays before
   each sweep.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *barr;
  PetscScalar                  *xarr;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xarr));
  PetscCall(VecCUDAGetArrayRead(b,&barr));
  PetscCall(PetscLogGpuTimeBegin());

  /* Lower sweep: L y = b, with b bound to descriptor X and the scratch array bound to Y.
     Note cusparseSpSV_solve() secretly reuses the external buffer passed to cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X,(void*)barr));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y,factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       factors->spMatDescr_L, /* L Y = X */
                                       factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_L));

  /* Upper sweep: U x = y, rebinding descriptor X to the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X,xarr));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       factors->spMatDescr_U, /* U X = Y */
                                       factors->dnVecDescr_Y,factors->dnVecDescr_X,
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b,&barr));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarr));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1658 
/*
   Solves fact^T x = b with the ILU(0) factors using transpose-mode cusparseSpSV:
   first U^T y = b, then L^T x = y. The transpose SpSV descriptors and their
   work buffers are created on the first call, and the analysis phase is run
   lazily, with both cached in the tri-factors structure afterwards.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *barr;
  PetscScalar                  *xarr;

  PetscFunctionBegin;
  if (!factors->createdTransposeSpSVDescr) { /* first transpose solve: create the Lt/Ut descriptors and size their buffers */
    /* The matrix descriptor is still L; we merely do a transposed solve with it */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                              factors->spMatDescr_L,factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                              cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Lt,
                                              &factors->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                              factors->spMatDescr_U,factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                              cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Ut,
                                              &factors->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void**)&factors->spsvBuffer_Lt,factors->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void**)&factors->spsvBuffer_Ut,factors->spsvBufferSize_Ut));
    factors->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!factors->updatedTransposeSpSVAnalysis) { /* run the analysis phase for both transposed factors if not yet done */
    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                            factors->spMatDescr_L,factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                            cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Lt,
                                            factors->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                            factors->spMatDescr_U,factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                            cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Ut,
                                            factors->spsvBuffer_Ut));
    factors->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x,&xarr));
  PetscCall(VecCUDAGetArrayRead(b,&barr));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve U^T y = b, with b bound to descriptor X and the scratch array bound to Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X,(void*)barr));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y,factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       factors->spMatDescr_U, /* Ut Y = X */
                                       factors->dnVecDescr_X,factors->dnVecDescr_Y,
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Ut));

  /* Solve L^T x = y, rebinding descriptor X to the output array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X,xarr));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       factors->spMatDescr_L, /* Lt X = Y */
                                       factors->dnVecDescr_Y,factors->dnVecDescr_X,
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&barr));
  PetscCall(VecCUDARestoreArrayWrite(x,&xarr));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1756 
/* Numeric phase of the device ILU(0) factorization.

   Copies A's values into fact's device CSR array and factorizes it in place with cusparseXcsrilu02().
   Assumes MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0() already ran and populated fs with the CSR
   structure, descriptors, analysis info and work buffers. The call order below is significant:
   cusparseSpSV_analysis() reads matrix values, so it must come after the factorization.
 */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  /* Device-to-device async copy on PETSc's default stream; subsequent cusparse calls on the same stream see the new values */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Debug-only pivot probe; cusparseXcsrilu02_zeroPivot() blocks until prior kernels finish */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_U,
                    fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
1831 
/* Symbolic phase of the device ILU(0) factorization.

   ILU(0) keeps A's sparsity pattern, so there is no fill computation; this routine instead
   (1) mirrors A's CSR structure into fact on the GPU, (2) creates descriptors for M (L and U
   packed in one CSR array) and for its L/U triangular views, (3) queries and allocates the
   csrilu02 and SpSV work buffers (aliasing factBuffer_M with the larger SpSV buffer to save
   memory), (4) runs the csrilu02 analysis, and (5) estimates the flops of the numeric phase.
   The numeric phase is MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0().

   isrow/iscol are ignored: the dispatcher MatILUFactorSymbolic_SeqAIJCUSPARSE() only takes
   this path for identity permutations.

   Fix vs previous revision: removed the dead store "nzLeft = Adiag[i] - Ai[i];" in the flop
   estimate, which was immediately overwritten by the (nzRow-1)/2 estimate and never used.
 */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0) adds no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U. L and U are triangular views of the    */
  /* same CSR arrays as M; the fill-mode/diag-type attributes tell cusparse */
  /* which triangle (and which diagonal convention) each view uses.         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L of an LU factorization has an implicit unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_U,
                    &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_U,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *Ai,*Adiag,nzRow,nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i=0; i<m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i+1] - Ai[i];
        /* Assume eliminating each nonzero left of the diagonal updates roughly half of the row's
           entries (including the eliminated one), at one multiplication and one addition each.
         */
        nzLeft = (nzRow-1)/2;
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}
2041 
/* Triangular solve for the IC(0) factorization: x = (L*Lt)^{-1} b.

   The dense-vector descriptors X and Y are reused across calls; before each solve they are
   retargeted (via cusparseDnVecSetValues) at the arrays actually involved: first X wraps b
   and Y wraps the internal work vector fs->Y, then X is repointed at x's output array.
 */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors  *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar             *bdata;
  PetscScalar                   *xdata;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&xdata));
  PetscCall(VecCUDAGetArrayRead(b,&bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b, with y stored in fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X,fs->dnVecDescr_Y,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_L));

  /* Backward solve Lt x = y, writing directly into x's device array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y,fs->dnVecDescr_X,cusparse_scalartype,
                                       CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&bdata));
  PetscCall(VecCUDARestoreArrayWrite(x,&xdata));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
2086 
/* Numeric phase of the device IC(0) (incomplete Cholesky) factorization.

   Copies A's values into fact's device CSR array and factorizes in place with cusparseXcsric02().
   Assumes MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0() already set up fs (CSR structure, descriptors,
   analysis info, buffers). SpSV analysis must follow the factorization because it reads values.
 */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, /* as with csrilu02, skip the call for empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ic0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Debug-only pivot probe; cusparseXcsric02_zeroPivot() blocks until prior kernels are done */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  /* cusparseSpSV_analysis() reads matrix values, so it must come after cusparseXcsric02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Lt,
                    fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  /* The factor is L*Lt, so the transpose solve is the same routine as the forward solve */
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
2164 
/* Symbolic phase of the device IC(0) factorization.

   IC(0) keeps A's sparsity pattern, so there is no fill computation. This routine mirrors
   A's CSR structure into fact on the GPU, creates descriptors for M and its lower-triangular
   view L (Lt solves reuse the same descriptor with a transpose op), queries/allocates the
   csric02 and SpSV buffers (aliasing factBuffer_M with the larger SpSV buffer), runs the
   csric02 analysis and estimates numeric-phase flops. perm is ignored; info->fill only is read.
 */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0) adds no fill beyond A's pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai,*Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; /* unlike ILU's L, the Cholesky factor carries its real diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz,
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ic0Info_M,
                    &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    &fs->spsvBufferSize_L));

  /* The Lt solve reuses spMatDescr_L with a transpose operation; only the SpSV descriptor differs */
  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_Lt,
                    &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_Lt,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz,
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ic0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *Ai,nzRow,nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i=0; i<m; i++) {
      nzRow = Ai[i+1] - Ai[i];
      if (nzRow > 1) {
        /* Diagonal positions are not computed here, so assume about half of row i's nonzeros lie
           left of the diagonal; eliminating each one updates roughly the remaining entries of the
           row (including the eliminated one), at one multiplication and one addition each.
         */
        nzLeft = (nzRow-1)/2;
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
2350 #endif
2351 
/*
  Symbolic ILU factorization of A into the factor matrix B.

  With a new-enough cuSPARSE (CUSPARSE_VERSION >= 11500), ILU(0) with identity row and
  column permutations — and only when the user asked to factorize on the device — is
  dispatched to the specialized device path MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
  In every other case the host SeqAIJ symbolic phase is used and the numeric phase is
  set to the generic MatLUFactorNumeric_SeqAIJCUSPARSE().
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
 #if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE,col_identity = PETSC_FALSE;
  /* Only bother checking the permutations if the device path could actually be taken */
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow,&row_identity));
    PetscCall(ISIdentity(iscol,&col_identity));
  }
  if (!info->levels && row_identity && col_identity) { /* ILU(0) with natural ordering */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info));
  } else
 #endif
  {
    /* Fall back to the host symbolic factorization; clear any stale device factors first */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
2374 
/*
  Symbolic LU factorization of A into B: resets any previously built device-side
  triangular factors, runs the host SeqAIJ symbolic phase, and installs the
  CUSPARSE numeric LU routine.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* Stale device factors from a previous factorization must not survive a new symbolic phase */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2385 
/*
  Symbolic ICC factorization of A into the factor matrix B.

  Mirrors MatILUFactorSymbolic_SeqAIJCUSPARSE(): with CUSPARSE_VERSION >= 11500,
  ICC(0) with an identity permutation and device-side factorization enabled is
  dispatched to MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(); otherwise the host
  SeqAIJ symbolic phase runs and the generic CUSPARSE numeric Cholesky is installed.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
 #if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  /* The device ICC(0) path requires the natural ordering */
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm,&perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info));
  } else
 #endif
  {
    /* Host fallback; drop any stale device factors before rebuilding */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
2405 
/*
  Symbolic Cholesky factorization of A into B: discards any device-side triangular
  factors, performs the host SeqAIJ symbolic phase, and installs the CUSPARSE
  numeric Cholesky routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* A fresh symbolic phase invalidates whatever factors were previously on the device */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2416 
/*
  Reports the solver package that produced this factor matrix.

  Output parameter:
.   type - set to MATSOLVERCUSPARSE ("cusparse")
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2423 
2424 /*MC
2425   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2426   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2428   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2429   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2430   algorithms are not recommended. This class does NOT support direct solver operations.
2431 
2432   Level: beginner
2433 
2434 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2435 M*/
2436 
/*
  Creates the factor matrix B (of type MATSEQAIJCUSPARSE) for A, for factor type ftype
  (LU/ILU/ILUDT or Cholesky/ICC). The option -mat_factor_bind_factorization {host|device}
  (default "device") chooses where the factorization is performed when a device path exists;
  matrices bound to the CPU always get the host SeqAIJ symbolic routines.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;
  PetscBool      factOnDevice,factOnHost;
  char           *prefix;
  char           factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* Honor the factor-specific options prefix if one was set, else fall back to A's prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)),prefix,"MatGetFactor","Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization","Do matrix factorization on host or device when possible","MatGetFactor",NULL,factPlace,sizeof(factPlace),NULL));
  PetscOptionsEnd();
  /* Validate the user-supplied placement string (case-insensitive) */
  PetscCall(PetscStrcasecmp("device",factPlace,&factOnDevice));
  PetscCall(PetscStrcasecmp("host",factPlace,&factOnHost));
  PetscCheck(factOnDevice || factOnHost,PetscObjectComm((PetscObject)(*B)),PETSC_ERR_ARG_OUTOFRANGE,"Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed",factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors*)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    /* Use the CUSPARSE symbolic routines unless the matrix is bound to the CPU */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* Allocation happens later during the symbolic factorization */
  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2489 
/*
  Copies the matrix values (the sparsity pattern is assumed unchanged) from the device
  back to the host when the current data lives only on the GPU. For non-factored
  matrices the values come from the cusparse CsrMatrix; for factored matrices
  (where A->spptr holds the triangular-factors struct instead) the values come from
  fs->csrVal when that device array exists. On success the offload mask becomes BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
 #if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; /* same pointer, reinterpreted for factored matrices */
 #endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
   #if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
   #endif
    else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2518 
/* Read/write access to the host value array; syncs values from the device first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Make the host copy current before handing out its pointer */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2526 
/* Ends read/write host access: the caller may have changed values, so the device copy is now stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* host is the only valid copy from now on */
  PetscFunctionReturn(0);
}
2534 
/* Read-only access to the host value array; syncs values from the device first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Ensure host values are up to date before exposing them */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2542 
/* Ends read-only host access; no offload-mask change since nothing was modified */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
2549 
/* Write-only access to the host value array; existing values need not be synced from the device */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
2556 
/* Ends write host access: host values were (re)written, so only the CPU copy is valid */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* force a re-upload before the next device use */
  PetscFunctionReturn(0);
}
2564 
/*
  Gives callers direct access to the device-resident CSR arrays of A
  (row offsets i, column indices j, values a) and reports their memory type (CUDA).

  Only valid for non-factored matrices. The device index arrays are 32-bit
  (THRUSTINTARRAY32), so builds with 64-bit PetscInt cannot hand them out as
  PetscInt pointers and error instead.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  /* Make sure the device copy exists and is current */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    /* Fixed message grammar: was "cuSparse does not supported 64-bit indices" */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE does not support 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE does not support 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
2595 
/*
  Uploads the host SeqAIJ data of A to the GPU, building (or refreshing) the
  cusparse representation in A->spptr.

  Two paths:
  - If the nonzero pattern is unchanged (same nonzerostate) and the format is CSR,
    only the values are re-assigned into the existing device CsrMatrix and the
    cached transpose is invalidated (values only).
  - Otherwise the whole device structure is destroyed and rebuilt: CSR arrays are
    copied up, a matrix descriptor and device-side scalar constants are created,
    and (for CUDA < 11) an optional ELL/HYB conversion is performed. Compressed-row
    index data is uploaded when in use.

  Errors if the matrix is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* whether host and device end up holding the same data */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* New values invalidate the cached transpose values (pattern is still valid) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* Pattern changed (or non-CSR format): tear down and rebuild the device structures */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Choose between the compressed-row view and the full row view of the host CSR */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values: device will hold values the host lacks */
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants (1 and 0) used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* Build a temporary CSR on device, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
2750 
/* Functor for thrust::for_each over a zip iterator: accumulates the first tuple
   element into the second, i.e. get<1> += get<0>. Usable on host and device. */
struct VecCUDAPlusEquals
{
  template <typename TupleType>
  __host__ __device__
  void operator()(TupleType tup)
  {
    thrust::get<1>(tup) = thrust::get<1>(tup) + thrust::get<0>(tup);
  }
};
2760 
/* Functor for thrust::for_each over a zip iterator: copies the first tuple
   element into the second, i.e. get<1> = get<0>. Usable on host and device. */
struct VecCUDAEquals
{
  template <typename TupleType>
  __host__ __device__
  void operator()(TupleType tup)
  {
    thrust::get<1>(tup) = thrust::get<0>(tup);
  }
};
2770 
/* Functor for thrust::for_each over a zip iterator: copies in the opposite
   direction of VecCUDAEquals, i.e. get<0> = get<1>. Usable on host and device. */
struct VecCUDAEqualsReverse
{
  template <typename TupleType>
  __host__ __device__
  void operator()(TupleType tup)
  {
    thrust::get<0>(tup) = thrust::get<1>(tup);
  }
};
2780 
/* Per-product scratch data attached to C->product->data for sparse-times-dense
   (and related) products performed with cuSPARSE. Freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was MATSEQDENSE on entry; convert back after the device computation */
  PetscScalar           *Bt;       /* device buffer holding B^T (pre-CUDA-11 path, which cannot transpose B) */
  Mat                   X;         /* intermediate dense result for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;  /* size of mmBuffer in bytes */
  void                  *mmBuffer;     /* workspace for cusparseSpMM */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2805 
/*
  Destructor for the MatMatCusparse product data: releases device buffers,
  cusparse matrix descriptors, the intermediate matrix X, and the struct itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt)); /* cudaFree(NULL) is a no-op, so no guard needed */
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* Descriptors are only created on demand; destroy only those that exist */
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5)  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
 #endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2829 
2830 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2831 
/*
  Numeric phase of C = op(A) op(B) where A is MATSEQAIJCUSPARSE (sparse) and B is dense.
  Supports AB, AtB, ABt, PtAP, and RARt products. For PtAP/RARt the sparse-dense
  multiply is computed into the intermediate dense matrix mmdata->X, then finished with
  a dense-dense multiply. On CUDA >= 11 the generic cusparseSpMM API is used (with lazy
  (re)creation of descriptors and workspace when leading dimensions change); on older
  CUDA the legacy csrmm is used, with B transposed explicitly via cublasXgeam when needed.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* Select which device matrix (A or its explicit transpose), which operation, and the
     dimensions m x n of the sparse-dense product result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* Use a precomputed explicit transpose of A instead of the transpose operation */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* Grow the workspace only if the required size exceeds what is already allocated */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* Form B^T explicitly into mmdata->Bt via an out-of-place transpose */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
 #endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* Finish PtAP/RARt with a dense-dense multiply of B (or B^T) against X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* Convert back to the types the caller originally supplied */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
3002 
/* Symbolic phase for products of a MATSEQAIJCUSPARSE matrix A with a (CUDA) dense
   matrix B. Sets the sizes and type of C, allocates the MatMatCusparse product data,
   and hooks up the numeric phase. Supported flavors: AB, AtB, ABt, PtAP, RARt. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  Mat_SeqAIJCUSPARSE *acusp;
  MatMatCusparse     *mm;
  PetscInt           m,n;
  PetscBool          host_dense,match;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&match));
  PetscCheck(match,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the result depend on the product flavor */
  if      (product->type == MATPRODUCT_AB)   { m = A->rmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_AtB)  { m = A->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_ABt)  { m = A->rmap->n; n = B->rmap->n; }
  else if (product->type == MATPRODUCT_PtAP) { m = B->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_RARt) { m = B->rmap->n; n = B->rmap->n; }
  else SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&host_dense));
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mm));
  mm->cisdense = host_dense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* PtAP and RARt are computed in two steps and need an intermediate dense matrix X.
     Do not preallocate here: the first call to MatDenseCUDAGetArray will preallocate
     X on the GPU for us. */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    const PetscInt xn = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mm->X));
    PetscCall(MatSetType(mm->X,MATSEQDENSECUDA));
    PetscCall(MatSetSizes(mm->X,A->rmap->n,xn,A->rmap->n,xn));
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
3075 
/* Numeric phase of the sparse-sparse product C = op(A)*op(B) with A, B, C all of type
   MATSEQAIJCUSPARSE. The symbolic phase must have run first: it created the product
   data (MatMatCusparse) and the CSR structure of C on the GPU. AtB/ABt are handled by
   multiplying with the explicit transposes stored in matTranspose, since (per the note
   on opA/opB below) the cuSPARSE spgemm path does not support transpose operations. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    /* only validate that the C structures built by the symbolic phase are in place,
       then jump straight to the assembly bookkeeping */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, only finalize the assembly state */
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the GPU before multiplying */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetric operands let us replace AtB/ABt with plain AB; the symbolic phase must
     have exploited the same fact, otherwise the stored structures would not match */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the operand structs: transposed flavors use the explicit transposes */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* alpha_one/beta_zero are device pointers (see their cudaMalloc in the symbolic phase),
     hence the device pointer mode */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* SpGEMMreuse path: the descriptor already holds structure/buffers from symbolic,
       so only the compute step is needed here */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  #else
    /* plain SpGEMM path: recompute with the buffers saved by symbolic, then copy the
       result values into C's descriptor */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  #endif
#else
  /* pre-CUDA-11 legacy csrgemm interface */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was precomputed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* values now live on the GPU only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
3203 
3204 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3205 {
3206   Mat_Product                  *product = C->product;
3207   Mat                          A,B;
3208   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
3209   Mat_SeqAIJ                   *a,*b,*c;
3210   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
3211   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3212   PetscInt                     i,j,m,n,k;
3213   PetscBool                    flg;
3214   cusparseStatus_t             stat;
3215   MatProductType               ptype;
3216   MatMatCusparse               *mmdata;
3217   PetscLogDouble               flops;
3218   PetscBool                    biscompressed,ciscompressed;
3219 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3220   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
3221   cusparseSpMatDescr_t         BmatSpDescr;
3222 #else
3223   int                          cnz;
3224 #endif
3225   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3226 
3227   PetscFunctionBegin;
3228   MatCheckProduct(C,1);
3229   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
3230   A    = product->A;
3231   B    = product->B;
3232   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
3233   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
3234   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
3235   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
3236   a = (Mat_SeqAIJ*)A->data;
3237   b = (Mat_SeqAIJ*)B->data;
3238   /* product data */
3239   PetscCall(PetscNew(&mmdata));
3240   C->product->data    = mmdata;
3241   C->product->destroy = MatDestroy_MatMatCusparse;
3242 
3243   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3244   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3245   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3246   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
3247   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3248   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3249 
3250   ptype = product->type;
3251   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3252     ptype = MATPRODUCT_AB;
3253     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3254   }
3255   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3256     ptype = MATPRODUCT_AB;
3257     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3258   }
3259   biscompressed = PETSC_FALSE;
3260   ciscompressed = PETSC_FALSE;
3261   switch (ptype) {
3262   case MATPRODUCT_AB:
3263     m = A->rmap->n;
3264     n = B->cmap->n;
3265     k = A->cmap->n;
3266     Amat = Acusp->mat;
3267     Bmat = Bcusp->mat;
3268     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3269     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3270     break;
3271   case MATPRODUCT_AtB:
3272     m = A->cmap->n;
3273     n = B->cmap->n;
3274     k = A->rmap->n;
3275     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3276     Amat = Acusp->matTranspose;
3277     Bmat = Bcusp->mat;
3278     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3279     break;
3280   case MATPRODUCT_ABt:
3281     m = A->rmap->n;
3282     n = B->rmap->n;
3283     k = A->cmap->n;
3284     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3285     Amat = Acusp->mat;
3286     Bmat = Bcusp->matTranspose;
3287     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3288     break;
3289   default:
3290     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3291   }
3292 
3293   /* create cusparse matrix */
3294   PetscCall(MatSetSizes(C,m,n,m,n));
3295   PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
3296   c     = (Mat_SeqAIJ*)C->data;
3297   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
3298   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3299   Ccsr  = new CsrMatrix;
3300 
3301   c->compressedrow.use = ciscompressed;
3302   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3303     c->compressedrow.nrows = a->compressedrow.nrows;
3304     PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
3305     PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
3306     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3307     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3308     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
3309   } else {
3310     c->compressedrow.nrows  = 0;
3311     c->compressedrow.i      = NULL;
3312     c->compressedrow.rindex = NULL;
3313     Ccusp->workVector       = NULL;
3314     Cmat->cprowIndices      = NULL;
3315   }
3316   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
3317   Ccusp->mat      = Cmat;
3318   Ccusp->mat->mat = Ccsr;
3319   Ccsr->num_rows    = Ccusp->nrows;
3320   Ccsr->num_cols    = n;
3321   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
3322   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3323   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3324   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3325   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
3326   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
3327   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3328   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
3329   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
3330   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
3331   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3332     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
3333     c->nz = 0;
3334     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3335     Ccsr->values = new THRUSTARRAY(c->nz);
3336     goto finalizesym;
3337   }
3338 
3339   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
3340   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
3341   Acsr = (CsrMatrix*)Amat->mat;
3342   if (!biscompressed) {
3343     Bcsr = (CsrMatrix*)Bmat->mat;
3344 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3345     BmatSpDescr = Bmat->matDescr;
3346 #endif
3347   } else { /* we need to use row offsets for the full matrix */
3348     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
3349     Bcsr = new CsrMatrix;
3350     Bcsr->num_rows       = B->rmap->n;
3351     Bcsr->num_cols       = cBcsr->num_cols;
3352     Bcsr->num_entries    = cBcsr->num_entries;
3353     Bcsr->column_indices = cBcsr->column_indices;
3354     Bcsr->values         = cBcsr->values;
3355     if (!Bcusp->rowoffsets_gpu) {
3356       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3357       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3358       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
3359     }
3360     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3361     mmdata->Bcsr = Bcsr;
3362 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3363     if (Bcsr->num_rows && Bcsr->num_cols) {
3364       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
3365                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3366                                Bcsr->values->data().get(),
3367                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
3368                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
3369     }
3370     BmatSpDescr = mmdata->matSpBDescr;
3371 #endif
3372   }
3373   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
3374   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
3375   /* precompute flops count */
3376   if (ptype == MATPRODUCT_AB) {
3377     for (i=0, flops = 0; i<A->rmap->n; i++) {
3378       const PetscInt st = a->i[i];
3379       const PetscInt en = a->i[i+1];
3380       for (j=st; j<en; j++) {
3381         const PetscInt brow = a->j[j];
3382         flops += 2.*(b->i[brow+1] - b->i[brow]);
3383       }
3384     }
3385   } else if (ptype == MATPRODUCT_AtB) {
3386     for (i=0, flops = 0; i<A->rmap->n; i++) {
3387       const PetscInt anzi = a->i[i+1] - a->i[i];
3388       const PetscInt bnzi = b->i[i+1] - b->i[i];
3389       flops += (2.*anzi)*bnzi;
3390     }
3391   } else { /* TODO */
3392     flops = 0.;
3393   }
3394 
3395   mmdata->flops = flops;
3396   PetscCall(PetscLogGpuTimeBegin());
3397 
3398 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3399   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3400   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
3401                           NULL, NULL, NULL,
3402                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
3403                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
3404   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3405  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3406  {
3407   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3408      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3409   */
3410   void*  dBuffer1 = NULL;
3411   void*  dBuffer2 = NULL;
3412   void*  dBuffer3 = NULL;
3413   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3414   size_t bufferSize1 = 0;
3415   size_t bufferSize2 = 0;
3416   size_t bufferSize3 = 0;
3417   size_t bufferSize4 = 0;
3418   size_t bufferSize5 = 0;
3419 
3420   /*----------------------------------------------------------------------*/
3421   /* ask bufferSize1 bytes for external memory */
3422   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3423                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3424                                             &bufferSize1, NULL);PetscCallCUSPARSE(stat);
3425   PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
3426   /* inspect the matrices A and B to understand the memory requirement for the next step */
3427   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3428                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3429                                             &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);
3430 
3431   /*----------------------------------------------------------------------*/
3432   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3433                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3434                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
3435   PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
3436   PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
3437   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
3438   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3439                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3440                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
3441   PetscCallCUDA(cudaFree(dBuffer1));
3442   PetscCallCUDA(cudaFree(dBuffer2));
3443 
3444   /*----------------------------------------------------------------------*/
3445   /* get matrix C non-zero entries C_nnz1 */
3446   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3447   c->nz = (PetscInt) C_nnz1;
3448   /* allocate matrix C */
3449   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3450   Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3451   /* update matC with the new pointers */
3452   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
3453                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3454 
3455   /*----------------------------------------------------------------------*/
3456   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3457                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3458                                   &bufferSize5, NULL);PetscCallCUSPARSE(stat);
3459   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
3460   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3461                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3462                                   &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
3463   PetscCallCUDA(cudaFree(dBuffer3));
3464   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
3465                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3466                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3467                                      mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3468   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
3469  }
3470  #else
3471   size_t bufSize2;
3472   /* ask bufferSize bytes for external memory */
3473   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3474                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3475                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3476                                        mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
3477   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
3478   /* inspect the matrices A and B to understand the memory requirement for the next step */
3479   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3480                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3481                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3482                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
3483   /* ask bufferSize again bytes for external memory */
3484   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3485                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3486                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3487                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
3488   /* The CUSPARSE documentation is not clear, nor the API
3489      We need both buffers to perform the operations properly!
3490      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3491      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3492      is stored in the descriptor! What a messy API... */
3493   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
3494   /* compute the intermediate product of A * B */
3495   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3496                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3497                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3498                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
3499   /* get matrix C non-zero entries C_nnz1 */
3500   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3501   c->nz = (PetscInt) C_nnz1;
3502   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
3503   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3504   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3505   Ccsr->values = new THRUSTARRAY(c->nz);
3506   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3507   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
3508                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3509   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
3510                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3511                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3512  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3513 #else
3514   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3515   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
3516                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3517                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3518                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3519                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
3520   c->nz = cnz;
3521   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3522   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3523   Ccsr->values = new THRUSTARRAY(c->nz);
3524   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3525 
3526   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3527   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3528      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3529      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3530   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
3531                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3532                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3533                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3534                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
3535 #endif
3536   PetscCall(PetscLogGpuFlops(mmdata->flops));
3537   PetscCall(PetscLogGpuTimeEnd());
3538 finalizesym:
3539   c->singlemalloc = PETSC_FALSE;
3540   c->free_a       = PETSC_TRUE;
3541   c->free_ij      = PETSC_TRUE;
3542   PetscCall(PetscMalloc1(m+1,&c->i));
3543   PetscCall(PetscMalloc1(c->nz,&c->j));
3544   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3545     PetscInt *d_i = c->i;
3546     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3547     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3548     ii   = *Ccsr->row_offsets;
3549     jj   = *Ccsr->column_indices;
3550     if (ciscompressed) d_i = c->compressedrow.i;
3551     PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3552     PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3553   } else {
3554     PetscInt *d_i = c->i;
3555     if (ciscompressed) d_i = c->compressedrow.i;
3556     PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3557     PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3558   }
3559   if (ciscompressed) { /* need to expand host row offsets */
3560     PetscInt r = 0;
3561     c->i[0] = 0;
3562     for (k = 0; k < c->compressedrow.nrows; k++) {
3563       const PetscInt next = c->compressedrow.rindex[k];
3564       const PetscInt old = c->compressedrow.i[k];
3565       for (; r < next; r++) c->i[r+1] = old;
3566     }
3567     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
3568   }
3569   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
3570   PetscCall(PetscMalloc1(m,&c->ilen));
3571   PetscCall(PetscMalloc1(m,&c->imax));
3572   c->maxnz = c->nz;
3573   c->nonzerorowcnt = 0;
3574   c->rmax = 0;
3575   for (k = 0; k < m; k++) {
3576     const PetscInt nn = c->i[k+1] - c->i[k];
3577     c->ilen[k] = c->imax[k] = nn;
3578     c->nonzerorowcnt += (PetscInt)!!nn;
3579     c->rmax = PetscMax(c->rmax,nn);
3580   }
3581   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3582   PetscCall(PetscMalloc1(c->nz,&c->a));
3583   Ccsr->num_entries = c->nz;
3584 
3585   C->nonzerostate++;
3586   PetscCall(PetscLayoutSetUp(C->rmap));
3587   PetscCall(PetscLayoutSetUp(C->cmap));
3588   Ccusp->nonzerostate = C->nonzerostate;
3589   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
3590   C->preallocated  = PETSC_TRUE;
3591   C->assembled     = PETSC_FALSE;
3592   C->was_assembled = PETSC_FALSE;
3593   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3594     mmdata->reusesym = PETSC_TRUE;
3595     C->offloadmask   = PETSC_OFFLOAD_GPU;
3596   }
3597   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3598   PetscFunctionReturn(0);
3599 }
3600 
3601 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3602 
/* Select the implementation for a MatProduct whose A operand is SEQAIJCUSPARSE and whose
   B operand is either sparse or dense. The GPU backend is chosen when the operands live on
   the GPU; per-product command line options let the user force the CPU backend instead. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool   Bisdense = PETSC_FALSE,Biscusparse = PETSC_FALSE,Ciscusparse = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bisdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusparse));
  if (product->type == MATPRODUCT_ABC) { /* only ABC products have a C operand to inspect */
    Ciscusparse = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusparse));
  }
  if (Biscusparse && Ciscusparse) { /* we can always select the CPU backend */
    PetscBool  usecpu = PETSC_FALSE;
    const char *title = NULL,*optname = NULL,*man = NULL;

    /* Pick the option database strings for this product type. The option name depends on
       whether the user came through the old API (MatMatMult() and friends) or MatProduct */
    switch (product->type) {
    case MATPRODUCT_AB:
      title   = product->api_user ? "MatMatMult" : "MatProduct_AB";
      optname = product->api_user ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man     = "MatMatMult";
      break;
    case MATPRODUCT_AtB:
      title   = product->api_user ? "MatTransposeMatMult" : "MatProduct_AtB";
      optname = product->api_user ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man     = "MatTransposeMatMult";
      break;
    case MATPRODUCT_PtAP:
      title   = product->api_user ? "MatPtAP" : "MatProduct_PtAP";
      optname = product->api_user ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man     = "MatPtAP";
      break;
    case MATPRODUCT_RARt:
      title   = product->api_user ? "MatRARt" : "MatProduct_RARt";
      optname = product->api_user ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man     = "MatRARt";
      break;
    case MATPRODUCT_ABC:
      title   = product->api_user ? "MatMatMatMult" : "MatProduct_ABC";
      optname = product->api_user ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      man     = "MatMatMatMult";
      break;
    default:
      break;
    }
    if (optname) { /* query the backend option for the product types that support it */
      PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");
      PetscCall(PetscOptionsBool(optname,"Use CPU code",man,usecpu,&usecpu,NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusparse = Ciscusparse = PETSC_FALSE;
  }
  /* dispatch */
  if (Bisdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusparse && Ciscusparse) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no native GPU kernel for these; build them from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
3724 
/* yy = A*xx: delegate to the shared kernel with no add vector, no transpose, no conjugation */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3731 
/* zz = A*xx + yy: delegate to the shared kernel with no transpose, no conjugation */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3738 
/* yy = A^H * xx: delegate to the shared kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3745 
/* zz = A^H * xx + yy: delegate to the shared kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3752 
/* yy = A^T * xx: delegate to the shared kernel with trans = PETSC_TRUE, no conjugation */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3759 
/* ScatterAdd - CUDA kernel computing y[idx[i]] += x[i] for i in [0,n)

   Uses a grid-stride loop so that any launch configuration covers all n entries, and
   PetscInt arithmetic for the global index so it cannot be truncated through a 32-bit
   int when PetscInt is 64 bits and n is large. No atomics are used, so the idx entries
   must be distinct — presumably true for the compressed-row indices passed by the
   caller in this file; confirm if reused elsewhere. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)blockDim.x*gridDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3765 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared kernel behind MatMult, MatMultAdd, MatMultTranspose(Add) and
   MatMultHermitianTranspose(Add) for SEQAIJCUSPARSE matrices.

   Input Parameters:
+  A     - the matrix; its data is pushed to the GPU first if needed
.  xx    - the input vector
.  yy    - the vector to add, or NULL (yy may alias zz)
.  trans - apply the (conjugate) transpose of A
-  herm  - together with trans, use the conjugate transpose; herm without trans errors out

   Output Parameter:
.  zz    - the result z = op(A) x + y (just op(A) x when yy is NULL)
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* x/y lengths for the cuSPARSE dense-vector descriptors; set only on the CSR path
     (the non-CSR path errors out below under CUDA >= 11 before these are read) */
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: the product reduces to z = y (or z = 0 without y) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* for the Hermitian op, or when no explicit transpose is kept, let cuSPARSE apply op(A);
       otherwise multiply with the explicitly stored transpose, built on demand */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cuSpMV[] cache below, hence the range check */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* without y, the leading product of each nonzero row needs no add, hence the smaller count */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3953 }
3954 
/* zz = A^T * xx + yy: delegate to the shared kernel with trans = PETSC_TRUE, no conjugation */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3961 
/* Finish assembly via the base SeqAIJ routine; if assembly changed the nonzero pattern,
   the cached device-resident matrix no longer matches and must be discarded */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE      *cusparse = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscObjectState  nnzstate_before = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (cusparse->deviceMat && A->nonzerostate != nnzstate_before) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusparse->deviceMat));
    cusparse->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3977 
3978 /* --------------------------------------------------------------------------------*/
3979 /*@
3980    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3983    assembly performance the user should preallocate the matrix storage by setting
3984    the parameter nz (or the array nnz).  By setting these parameters accurately,
3985    performance during matrix assembly can be increased by more than a factor of 50.
3986 
3987    Collective
3988 
3989    Input Parameters:
3990 +  comm - MPI communicator, set to PETSC_COMM_SELF
3991 .  m - number of rows
3992 .  n - number of columns
3993 .  nz - number of nonzeros per row (same for all rows)
3994 -  nnz - array containing the number of nonzeros in the various rows
3995          (possibly different for each row) or NULL
3996 
3997    Output Parameter:
3998 .  A - the matrix
3999 
4000    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4002    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
4003 
4004    Notes:
4005    If nnz is given then nz is ignored
4006 
4007    The AIJ format (also called the Yale sparse matrix format or
4008    compressed row storage), is fully compatible with standard Fortran 77
4009    storage.  That is, the stored row and column indices can begin at
4010    either one (as in Fortran) or zero.  See the users' manual for details.
4011 
4012    Specify the preallocated storage with either nz or nnz (not both).
4013    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
4014    allocation.  For large problems you MUST preallocate memory or you
4015    will get TERRIBLE performance, see the users' manual chapter on matrices.
4016 
4017    By default, this format uses inodes (identical nodes) when possible, to
4018    improve numerical efficiency of matrix-vector products and solves. We
4019    search for consecutive rows with the same nonzero structure, thereby
4020    reusing matrix information to achieve increased efficiency.
4021 
4022    Level: intermediate
4023 
4024 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
4025 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n)); /* sequential matrix: local and global sizes coincide */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  /* per the man page above, nnz (when non-NULL) takes precedence over nz */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
4035 
/* Free the CUSPARSE data (or triangular-factor data for factored matrices) hanging off
   A->spptr, unregister every method this implementation composed on the object, and
   finish with the base SeqAIJ destroy */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  static const char *const composedOps[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  for (size_t i = 0; i < sizeof(composedOps)/sizeof(composedOps[0]); i++) { /* composing NULL removes the method */
    PetscCall(PetscObjectComposeFunction((PetscObject)A,composedOps[i],NULL));
  }
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
4057 
4058 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
4059 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying on the CPU with the SeqAIJ routine, then convert the copy in place
   back to SEQAIJCUSPARSE so it regains the GPU operations */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
4067 
/* Y = Y + a*X for two SEQAIJCUSPARSE matrices, performed on the GPU when possible:
   identical patterns use a single cublas axpy on the value arrays; a pattern of X
   contained in Y's uses cusparse geam; anything else falls back to the CPU kernel */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed types: let the CPU implementation handle it */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and, verified on the
     device, identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* geam computes a*X + b*Y with b = 1 */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* a and b live on the host here, so switch the pointer mode around the geam calls */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical sparsity: Y and X values line up entry by entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the CPU kernel */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
4161 
/* Scale every stored nonzero of Y by the scalar a, operating directly on the
   device-side value array through cuBLAS (no host/device transfer). */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&vals));     /* device pointer to the CSR values */
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz,&n));           /* number of stored nonzeros */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle,n,&a,vals,inc)); /* vals[i] *= a, on the GPU */
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));         /* cached diagonal is stale after scaling */
  PetscFunctionReturn(0);
}
4181 
/* Zero all stored values of A: the device CSR copy (and its cached transpose)
   when present, and always the host copy.  The offload mask records whether
   both copies are now valid or only the CPU one. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij       = (Mat_SeqAIJ*)A->data;
  PetscBool  zeroed_gpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        zeroed_gpu = PETSC_TRUE; /* device copy zeroed too, so both copies are in sync */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n])); /* zero the host copy as well */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = zeroed_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
4210 
/* Switch the matrix operation tables between CPU (SeqAIJ) and GPU (CUSPARSE)
   implementations.

   flg = PETSC_TRUE  -> bind to CPU: values are copied back from the GPU, the
                        op table points at the plain SeqAIJ routines, and the
                        GPU-only composed functions are removed.
   flg = PETSC_FALSE -> unbind: install the CUSPARSE routines and (re)compose
                        the GPU-specific functions.

   Factored matrices only record the flag; their solve path is managed elsewhere. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make the host copy current before handing control to CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the SeqAIJ-level array accessors installed below so defaults are used */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* SeqAIJ-level accessors that keep host/device copies coherent */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
   }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4274 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE.

   For MAT_INITIAL_MATRIX the input is duplicated; for MAT_REUSE_MATRIX the
   values are copied into *newmat; for MAT_INPLACE_MATRIX the input itself is
   retyped.  Allocates the CUSPARSE side structure (spptr) with its own
   cusparse handle on PETSc's default stream, installs the CUSPARSE method
   table via MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE), and composes the
   type-specific functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU too */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry a triangular-factors container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); /* install the GPU op table */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
4334 
/* MATSEQAIJCUSPARSE constructor: create a plain SeqAIJ matrix, then convert it
   in place, which installs the GPU function table and the CUSPARSE spptr. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
4342 
4343 /*MC
4344    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4345 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
4347    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
4348    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
4349 
4350    Options Database Keys:
4351 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
4352 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
4355 
4356   Level: beginner
4357 
4358 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4359 M*/
4360 
4361 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
4362 
/* Register the CUSPARSE factorization back-ends with PETSc's solver registry:
   the banded LU solver for SeqAIJ, and the standard cusparse solver for the
   four factor types on SeqAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  for (size_t i = 0; i < sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(0);
}
4374 
/* Release the COO assembly state attached to the GPU side of mat.

   Fix: jmap_d and perm_d are now reset to NULL after being freed.  Previously
   the stale device pointers survived this routine, and MatSeqAIJCUSPARSE_Destroy()
   (which frees them whenever they are non-NULL) would cudaFree() them a second
   time. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* clear so MatSeqAIJCUSPARSE_Destroy() does not double-free */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4392 
/* Free the Mat_SeqAIJCUSPARSE container and everything it owns: the device
   matrix and its cached transpose, scratch vectors/arrays, the COO assembly
   state, the cusparse handle, and finally the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;      /* deleting NULL thrust vectors is a no-op */
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct)); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(0);
}
4411 
/* Free a CsrMatrix: its three device arrays and the struct itself, then clear
   the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
4424 
/* Free one triangular-factor struct: its cusparse matrix descriptor, the
   csrsv solve info, the CSR factor data, and the associated work buffers.
   (Overloads the Mat_SeqAIJCUSPARSEMultStruct variant below.) */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    PetscCall(PetscFree(*trifactor)); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(0);
}
4441 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the device matrix (CSR, or hybrid on
   pre-11.0 CUDA), its descriptors, the compressed-row index array, the device
   scalar constants, and (CUDA >= 11) the SpMV descriptors/buffers.
   *matstruct is set to NULL on return.

   Fix: the return code of CsrMatrix_Destroy() was silently dropped; it is now
   checked with PetscCall() like every other call in this file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* was previously called without checking the error code */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV setup per operation (mult, transpose mult, hermitian transpose mult) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
4483 
/* Free all device resources held inside the triangular-factor container while
   keeping the container (and its cusparse handle) alive for reuse.

   Fix: the routine is now idempotent.  Previously a_band_d/i_band_d and the
   CUSPARSE >= 11500 buffers and descriptors were freed without being cleared,
   so a Reset followed by another Reset (e.g. via
   MatSeqAIJCUSPARSETriFactors_Destroy(), which calls Reset again) freed stale
   pointers.  Every freed device pointer is set to NULL (cudaFree(NULL) is a
   documented no-op) and every descriptor destroy is guarded and NULLed. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector = NULL;
    if (fs->a_band_d)   PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d)   PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->a_band_d = NULL; /* clear so a later Reset does not free a stale pointer */
    fs->i_band_d = NULL;
    fs->init_dev_prop = PETSC_FALSE;
   #if CUSPARSE_VERSION >= 11500
    /* cudaFree(NULL) is a no-op, so unconditional frees are safe even for buffers never allocated */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->csrRowPtr     = NULL;
    fs->csrColIdx     = NULL;
    fs->csrVal        = NULL;
    fs->X             = NULL;
    fs->Y             = NULL;
    fs->spsvBuffer_L  = NULL;
    fs->spsvBuffer_U  = NULL;
    fs->spsvBuffer_Lt = NULL;
    fs->spsvBuffer_Ut = NULL;
    /* destroy descriptors only when they exist, then clear them for idempotence */
    if (fs->matDescr_M)   {PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));     fs->matDescr_M   = NULL;}
    if (fs->spMatDescr_L) {PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));      fs->spMatDescr_L = NULL;}
    if (fs->spMatDescr_U) {PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));      fs->spMatDescr_U = NULL;}
    if (fs->spsvDescr_L)  {PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));  fs->spsvDescr_L  = NULL;}
    if (fs->spsvDescr_Lt) {PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); fs->spsvDescr_Lt = NULL;}
    if (fs->spsvDescr_U)  {PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));  fs->spsvDescr_U  = NULL;}
    if (fs->spsvDescr_Ut) {PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); fs->spsvDescr_Ut = NULL;}
    if (fs->dnVecDescr_X) {PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));      fs->dnVecDescr_X = NULL;}
    if (fs->dnVecDescr_Y) {PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));      fs->dnVecDescr_Y = NULL;}
    if (fs->ilu0Info_M)   {PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); fs->ilu0Info_M   = NULL;}
    if (fs->ic0Info_M)    {PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));   fs->ic0Info_M    = NULL;}

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
   #endif
  }
  PetscFunctionReturn(0);
}
4532 
/* Free the triangular-factor container entirely: release every resource via
   Reset, destroy the cusparse handle, then free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle = (*trifactors)->handle; /* Reset leaves the handle untouched */

    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
4547 
/* Strict-weak ordering of COO (row, col) tuples: by row, then by column. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
4558 
/* Equality of COO (row, col) tuples: equal iff both coordinates match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
4568 
/* Binary op for adjacent_difference: 1 where consecutive indices differ, 0 where they repeat. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
4577 
/* Logical OR of two 0/1 flags, returned as a PetscInt. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
4586 
4587 #include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   Scatter/accumulate the COO values v[] (host or device memory) into the
   device CSR value array, using the permutation cusp->cooPerm computed at
   preallocation time.  When cusp->cooPerm_a is non-NULL the COO input had
   repeated (i,j) entries and they are summed first via reduce_by_key.
   v == NULL with INSERT_VALUES zeroes the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;  /* device staging copy of v[] when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation state: nothing to scatter, just finish assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* v[] lives on the host: stage it through a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy is now the authoritative one */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
4669 
/* Mark the cached transpose of A as out of date; when destroy is true, also
   free the transpose matrix and the cached csr2csc permutation. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!spptr) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose,spptr->format));
    delete spptr->csr2csc_i;
    spptr->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE; /* next use will rebuild/refresh the transpose */
  PetscFunctionReturn(0);
}
4685 
4686 #include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A from n COO index pairs (coo_i[], coo_j[]),
   which may live in host or device memory.  The pairs are sorted on the GPU
   by (row, col); cusp->cooPerm records the permutation from user order to
   sorted order, and cusp->cooPerm_a (when repeats exist) maps each sorted
   entry to its unique-nonzero index.  The host-side AIJ arrays (a->i, a->j,
   ilen/imax, ...) are then rebuilt and a zero-valued copy is pushed to the GPU. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* previous COO state has a different length; discard it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i,d_j;
    PetscInt                     *d_raw_i,*d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE,free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype,jmtype;

    /* bring coo_i[] to the device if it is host memory */
    PetscCall(PetscGetMemType(coo_i,&imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i,sizeof(PetscInt)*n));
      PetscCallCUDA(cudaMemcpy(d_raw_i,coo_i,sizeof(PetscInt)*n,cudaMemcpyHostToDevice));
      d_i = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1.*n*sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j,&jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j,sizeof(PetscInt)*n));
      PetscCallCUDA(cudaMemcpy(d_raw_j,coo_j,sizeof(PetscInt)*n,cudaMemcpyHostToDevice));
      d_j = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1.*n*sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row end offsets, copied into a->i+1 below */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i,d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i+n,d_j+n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i,d_i+n); /* copy the sorted array */
    THRUSTINTARRAY w(d_j,d_j+n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekye
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host-side AIJ arrays from the sorted/unique device data */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,thrust::raw_pointer_cast(d_j),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
4822 
/* Set the COO (i,j) preallocation pattern for a MATSEQAIJCUSPARSE matrix.

   Two code paths exist:
   - "basic": indices live on the device, or live on the host and contain no
     negative entries; handled entirely by the _Basic variant.
   - "extended": host indices containing negative (i.e. ignored) entries; the
     CSR pattern plus the jmap/perm COO metadata are built on the host by the
     SeqAIJ implementation and then mirrored to the device. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *a;
  Mat_SeqAIJCUSPARSE *cusp;
  PetscBool          basic = PETSC_TRUE;
  PetscMemType       memtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* Drop any previously installed COO metadata on both host and device */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&memtype));
    if (PetscMemTypeHost(memtype)) {
      /* scan for negative (ignored) indices; only possible when indices are on the host */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    /* build pattern and jmap/perm on the host, then copy them to the device */
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    a    = static_cast<Mat_SeqAIJ*>(mat->data);
    cusp = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    PetscCallCUDA(cudaMalloc((void**)&cusp->jmap_d,(a->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(cusp->jmap_d,a->jmap,(a->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&cusp->perm_d,a->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(cusp->perm_d,a->perm,a->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    cusp->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4858 
/* Device kernel: for matrix nonzero i, accumulate the COO input values kv[]
   that map onto it (the permuted range perm[jmap[i]..jmap[i+1])) and either
   insert (INSERT_VALUES) or add the total into a[i].
   A grid-stride loop makes any launch configuration cover all nnz entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar val = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i+1]; k++) val += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + val; /* INSERT overwrites, ADD accumulates */
  }
}
4869 
/* Insert/add the values v[] (ordered as in the preallocation COO pattern)
   into the device CSR storage of A. Host-resident v[] is staged through a
   temporary device buffer; the extended path launches MatAddCOOValues. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscCount    nnz  = seq->nz;
  const PetscScalar   *vdev = v;
  PetscScalar         *Aa;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  } else {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* user gave v[] on the host: stage a device copy */
      PetscCallCUDA(cudaMalloc((void**)&vdev,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)vdev,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* write access suffices for INSERT; read-write is needed for ADD */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (nnz) {
      MatAddCOOValues<<<(nnz+255)/256,256>>>(vdev,nnz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)vdev));
  }
  PetscFunctionReturn(0);
}
4904 
4905 /*@C
4906     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4907 
4908    Not collective
4909 
4910     Input Parameters:
4911 +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4913 
4914     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4917 
4918     Level: developer
4919 
4920     Notes:
4921       When compressed is true, the CSR structure does not contain empty rows
4922 
4923 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4924 @*/
/* Return device pointers to the CSR row offsets (i) and column indices (j) of A.
   If both output pointers are requested and compressed is PETSC_FALSE while the
   matrix is stored in compressed-row form, a full (uncompressed) row-offset
   array is built once on the device and cached in cusp->rowoffsets_gpu.
   May trigger a host-to-device copy of the matrix data.
   Note: the original code re-tested `i` and `j` after the early return that
   already guarantees both are non-NULL; those always-true checks are removed. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* nothing requested */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!compressed && a->compressedrow.use) { /* need full row offsets, which only exist on the host; build and cache them on the device */
    if (!cusp->rowoffsets_gpu) {
      cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else *i = csr->row_offsets->data().get();
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4952 
4953 /*@C
4954     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4955 
4956    Not collective
4957 
4958     Input Parameters:
4959 +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4961 
4962     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4965 
4966     Level: developer
4967 
4968 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4969 @*/
/* Invalidate the borrowed i/j device pointers handed out by MatSeqAIJCUSPARSEGetIJ();
   no matrix state is modified. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* NULL-out whichever pointers the caller actually passed */
  if (i) { *i = NULL; }
  if (j) { *j = NULL; }
  PetscFunctionReturn(0);
}
4979 
4980 /*@C
4981    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4982 
4983    Not Collective
4984 
4985    Input Parameter:
4986 .   A - a MATSEQAIJCUSPARSE matrix
4987 
4988    Output Parameter:
4989 .   a - pointer to the device data
4990 
4991    Level: developer
4992 
4993    Notes: may trigger host-device copies if up-to-date matrix data is on host
4994 
4995 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4996 @*/
/* Hand out a read-only device pointer to the CSR values of A, copying the
   matrix data host->device first if the device copy is stale. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure device data is up to date */
  PetscCheck(sp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)sp->mat->mat;
  PetscCheck(mat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  PetscFunctionReturn(0);
}
5014 
5015 /*@C
5016    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
5017 
5018    Not Collective
5019 
5020    Input Parameter:
5021 .   A - a MATSEQAIJCUSPARSE matrix
5022 
5023    Output Parameter:
5024 .   a - pointer to the device data
5025 
5026    Level: developer
5027 
5028 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
5029 @*/
/* Invalidate a read-only values pointer from MatSeqAIJCUSPARSEGetArrayRead();
   since no data could have changed, no state bookkeeping is required. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* the pointer was borrowed; forbid further use */
  PetscFunctionReturn(0);
}
5039 
5040 /*@C
5041    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
5042 
5043    Not Collective
5044 
5045    Input Parameter:
5046 .   A - a MATSEQAIJCUSPARSE matrix
5047 
5048    Output Parameter:
5049 .   a - pointer to the device data
5050 
5051    Level: developer
5052 
5053    Notes: may trigger host-device copies if up-to-date matrix data is on host
5054 
5055 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
5056 @*/
/* Hand out a read-write device pointer to the CSR values of A. The device copy
   is refreshed first if stale; afterwards the device is marked as holding the
   authoritative data and any cached transpose values are invalidated. */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure device data is up to date before it gets modified */
  PetscCheck(sp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)sp->mat->mat;
  PetscCheck(mat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;       /* caller may write: device copy becomes authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* values-only invalidation of cached transpose */
  PetscFunctionReturn(0);
}
5076 /*@C
5077    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
5078 
5079    Not Collective
5080 
5081    Input Parameter:
5082 .   A - a MATSEQAIJCUSPARSE matrix
5083 
5084    Output Parameter:
5085 .   a - pointer to the device data
5086 
5087    Level: developer
5088 
5089 .seealso: `MatSeqAIJCUSPARSEGetArray()`
5090 @*/
/* Invalidate a read-write values pointer from MatSeqAIJCUSPARSEGetArray().
   The values may have changed, so the cached diagonal is invalidated and the
   object state counter is bumped. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));        /* diagonal entries may have been modified */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* signal potential value changes to observers */
  *a = NULL;
  PetscFunctionReturn(0);
}
5102 
5103 /*@C
5104    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
5105 
5106    Not Collective
5107 
5108    Input Parameter:
5109 .   A - a MATSEQAIJCUSPARSE matrix
5110 
5111    Output Parameter:
5112 .   a - pointer to the device data
5113 
5114    Level: developer
5115 
5116    Notes: does not trigger host-device copies and flags data validity on the GPU
5117 
5118 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
5119 @*/
/* Hand out a write-only device pointer to the CSR values of A. Unlike the
   read/read-write variants this deliberately performs NO host->device copy
   (the caller promises to overwrite); the device copy is marked authoritative
   and cached transpose values are invalidated. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *sp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(sp->format != MAT_CUSPARSE_ELL && sp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(sp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)sp->mat->mat;
  PetscCheck(mat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;       /* caller will write: device copy becomes authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); /* values-only invalidation of cached transpose */
  PetscFunctionReturn(0);
}
5138 
5139 /*@C
5140    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
5141 
5142    Not Collective
5143 
5144    Input Parameter:
5145 .   A - a MATSEQAIJCUSPARSE matrix
5146 
5147    Output Parameter:
5148 .   a - pointer to the device data
5149 
5150    Level: developer
5151 
5152 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
5153 @*/
/* Invalidate a write-only values pointer from MatSeqAIJCUSPARSEGetArrayWrite().
   The values were (presumably) overwritten, so the cached diagonal is
   invalidated and the object state counter is bumped. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));        /* diagonal entries may have been modified */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* signal potential value changes to observers */
  *a = NULL;
  PetscFunctionReturn(0);
}
5165 
/* Strict-weak "less than" on (row, col, value, flag) tuples: compares
   lexicographically by row then column, ignoring the value/flag slots.
   Used to merge two row-sorted COO streams in MatSeqAIJCUSPARSEMergeMats(). */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 != r2) ? (r1 < r2) : (t1.get<1>() < t2.get<1>());
  }
};
5176 
/* Unary functor that adds a fixed offset to an int; used to shift column
   indices / row offsets when concatenating two matrices side by side. */
struct Shift
{
  int _shift; /* the constant offset applied to every input */

  Shift(int offset) : _shift(offset) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
5188 
5189 /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
5190 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
5191 {
5192   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
5193   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
5194   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
5195   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
5196   PetscInt                     Annz,Bnnz;
5197   cusparseStatus_t             stat;
5198   PetscInt                     i,m,n,zero = 0;
5199 
5200   PetscFunctionBegin;
5201   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5202   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
5203   PetscValidPointer(C,4);
5204   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5205   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
5206   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
5207   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
5208   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5209   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5210   if (reuse == MAT_INITIAL_MATRIX) {
5211     m     = A->rmap->n;
5212     n     = A->cmap->n + B->cmap->n;
5213     PetscCall(MatCreate(PETSC_COMM_SELF,C));
5214     PetscCall(MatSetSizes(*C,m,n,m,n));
5215     PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
5216     c     = (Mat_SeqAIJ*)(*C)->data;
5217     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
5218     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
5219     Ccsr  = new CsrMatrix;
5220     Cmat->cprowIndices      = NULL;
5221     c->compressedrow.use    = PETSC_FALSE;
5222     c->compressedrow.nrows  = 0;
5223     c->compressedrow.i      = NULL;
5224     c->compressedrow.rindex = NULL;
5225     Ccusp->workVector       = NULL;
5226     Ccusp->nrows    = m;
5227     Ccusp->mat      = Cmat;
5228     Ccusp->mat->mat = Ccsr;
5229     Ccsr->num_rows  = m;
5230     Ccsr->num_cols  = n;
5231     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
5232     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
5233     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5234     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
5235     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
5236     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
5237     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5238     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
5239     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5240     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5241     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5242     PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5243     PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5244 
5245     Acsr = (CsrMatrix*)Acusp->mat->mat;
5246     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
5247     Annz = (PetscInt)Acsr->column_indices->size();
5248     Bnnz = (PetscInt)Bcsr->column_indices->size();
5249     c->nz = Annz + Bnnz;
5250     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
5251     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
5252     Ccsr->values = new THRUSTARRAY(c->nz);
5253     Ccsr->num_entries = c->nz;
5254     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
5255     if (c->nz) {
5256       auto Acoo = new THRUSTINTARRAY32(Annz);
5257       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
5258       auto Ccoo = new THRUSTINTARRAY32(c->nz);
5259       THRUSTINTARRAY32 *Aroff,*Broff;
5260 
5261       if (a->compressedrow.use) { /* need full row offset */
5262         if (!Acusp->rowoffsets_gpu) {
5263           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
5264           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
5265           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
5266         }
5267         Aroff = Acusp->rowoffsets_gpu;
5268       } else Aroff = Acsr->row_offsets;
5269       if (b->compressedrow.use) { /* need full row offset */
5270         if (!Bcusp->rowoffsets_gpu) {
5271           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
5272           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
5273           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
5274         }
5275         Broff = Bcusp->rowoffsets_gpu;
5276       } else Broff = Bcsr->row_offsets;
5277       PetscCall(PetscLogGpuTimeBegin());
5278       stat = cusparseXcsr2coo(Acusp->handle,
5279                               Aroff->data().get(),
5280                               Annz,
5281                               m,
5282                               Acoo->data().get(),
5283                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5284       stat = cusparseXcsr2coo(Bcusp->handle,
5285                               Broff->data().get(),
5286                               Bnnz,
5287                               m,
5288                               Bcoo->data().get(),
5289                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5290       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
5291       auto Aperm = thrust::make_constant_iterator(1);
5292       auto Bperm = thrust::make_constant_iterator(0);
5293 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
5294       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
5295       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
5296 #else
5297       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
5298       auto Bcib = Bcsr->column_indices->begin();
5299       auto Bcie = Bcsr->column_indices->end();
5300       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
5301 #endif
5302       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
5303       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
5304       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
5305       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
5306       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
5307       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
5308       auto p1 = Ccusp->cooPerm->begin();
5309       auto p2 = Ccusp->cooPerm->begin();
5310       thrust::advance(p2,Annz);
5311       PetscCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
5312 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
5313       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
5314 #endif
5315       auto cci = thrust::make_counting_iterator(zero);
5316       auto cce = thrust::make_counting_iterator(c->nz);
5317 #if 0 //Errors on SUMMIT cuda 11.1.0
5318       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
5319 #else
5320       auto pred = thrust::identity<int>();
5321       PetscCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
5322       PetscCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
5323 #endif
5324       stat = cusparseXcoo2csr(Ccusp->handle,
5325                               Ccoo->data().get(),
5326                               c->nz,
5327                               m,
5328                               Ccsr->row_offsets->data().get(),
5329                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5330       PetscCall(PetscLogGpuTimeEnd());
5331       delete wPerm;
5332       delete Acoo;
5333       delete Bcoo;
5334       delete Ccoo;
5335 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
5336       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
5337                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
5338                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
5339                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
5340 #endif
5341       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
5342         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
5343         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
5344         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5345         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
5346         CsrMatrix *CcsrT = new CsrMatrix;
5347         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
5348         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
5349 
5350         (*C)->form_explicit_transpose = PETSC_TRUE;
5351         (*C)->transupdated = PETSC_TRUE;
5352         Ccusp->rowoffsets_gpu = NULL;
5353         CmatT->cprowIndices = NULL;
5354         CmatT->mat = CcsrT;
5355         CcsrT->num_rows = n;
5356         CcsrT->num_cols = m;
5357         CcsrT->num_entries = c->nz;
5358 
5359         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
5360         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
5361         CcsrT->values = new THRUSTARRAY(c->nz);
5362 
5363         PetscCall(PetscLogGpuTimeBegin());
5364         auto rT = CcsrT->row_offsets->begin();
5365         if (AT) {
5366           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
5367           thrust::advance(rT,-1);
5368         }
5369         if (BT) {
5370           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
5371           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
5372           thrust::copy(titb,tite,rT);
5373         }
5374         auto cT = CcsrT->column_indices->begin();
5375         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
5376         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
5377         auto vT = CcsrT->values->begin();
5378         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
5379         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
5380         PetscCall(PetscLogGpuTimeEnd());
5381 
5382         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
5383         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
5384         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5385         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
5386         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
5387         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
5388         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5389         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
5390         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5391 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
5392         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
5393                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
5394                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
5395                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
5396 #endif
5397         Ccusp->matTranspose = CmatT;
5398       }
5399     }
5400 
5401     c->singlemalloc = PETSC_FALSE;
5402     c->free_a       = PETSC_TRUE;
5403     c->free_ij      = PETSC_TRUE;
5404     PetscCall(PetscMalloc1(m+1,&c->i));
5405     PetscCall(PetscMalloc1(c->nz,&c->j));
5406     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
5407       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
5408       THRUSTINTARRAY jj(Ccsr->column_indices->size());
5409       ii   = *Ccsr->row_offsets;
5410       jj   = *Ccsr->column_indices;
5411       PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5412       PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5413     } else {
5414       PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5415       PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5416     }
5417     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
5418     PetscCall(PetscMalloc1(m,&c->ilen));
5419     PetscCall(PetscMalloc1(m,&c->imax));
5420     c->maxnz = c->nz;
5421     c->nonzerorowcnt = 0;
5422     c->rmax = 0;
5423     for (i = 0; i < m; i++) {
5424       const PetscInt nn = c->i[i+1] - c->i[i];
5425       c->ilen[i] = c->imax[i] = nn;
5426       c->nonzerorowcnt += (PetscInt)!!nn;
5427       c->rmax = PetscMax(c->rmax,nn);
5428     }
5429     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
5430     PetscCall(PetscMalloc1(c->nz,&c->a));
5431     (*C)->nonzerostate++;
5432     PetscCall(PetscLayoutSetUp((*C)->rmap));
5433     PetscCall(PetscLayoutSetUp((*C)->cmap));
5434     Ccusp->nonzerostate = (*C)->nonzerostate;
5435     (*C)->preallocated  = PETSC_TRUE;
5436   } else {
5437     PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
5438     c = (Mat_SeqAIJ*)(*C)->data;
5439     if (c->nz) {
5440       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
5441       PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
5442       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5443       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
5444       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5445       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5446       PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5447       PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5448       Acsr = (CsrMatrix*)Acusp->mat->mat;
5449       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
5450       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
5451       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
5452       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
5453       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
5454       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
5455       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
5456       auto pmid = Ccusp->cooPerm->begin();
5457       thrust::advance(pmid,Acsr->num_entries);
5458       PetscCall(PetscLogGpuTimeBegin());
5459       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
5460                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
5461       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
5462                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
5463       thrust::for_each(zibait,zieait,VecCUDAEquals());
5464       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
5465                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
5466       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
5467                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
5468       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
5469       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
5470       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
5471         PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5472         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5473         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
5474         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
5475         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
5476         auto vT = CcsrT->values->begin();
5477         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
5478         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
5479         (*C)->transupdated = PETSC_TRUE;
5480       }
5481       PetscCall(PetscLogGpuTimeEnd());
5482     }
5483   }
5484   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5485   (*C)->assembled     = PETSC_TRUE;
5486   (*C)->was_assembled = PETSC_FALSE;
5487   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
5488   PetscFunctionReturn(0);
5489 }
5490 
/*
  Gathers selected entries of the AIJ value array of A into v on the GPU.

  Input Parameters:
+ A   - the MATSEQAIJCUSPARSE matrix whose (device) value array is read
. n   - number of entries to copy
- idx - indices (in host memory) into A's value array; if NULL, the first n
        entries are copied contiguously

  Output Parameter:
. v - destination array; may be either host or device memory (detected with
      isCudaMem()), n entries are written

  Notes:
  With idx, the gather v[i] = av[idx[i]] is performed on the device via a
  thrust permutation iterator; when v is host memory the gather lands in a
  temporary device buffer that is then copied down to the host.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does the caller want the result in device memory? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a temporary device buffer first */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = dav[widx[i]] for i in [0,n), performed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values straight to v */
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the payload above moved device->host, so log it as
     GpuToCpu (the original code logged CpuToGpu, i.e. the wrong direction) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
5528