/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we list them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
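/* Illustration (an added note, not part of the library logic): PetscOptionsEnum() returns the
   0-based position of the string selected from one of the tables above, so e.g.
   "-mat_cusparse_spmv_alg csrmv_alg1" yields 2, which must equal CUSPARSE_CSRMV_ALG1; the
   PetscCheck() calls in MatSetFromOptions_SeqAIJCUSPARSE() below verify that correspondence
   at runtime. */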

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
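
/* Example usage (a sketch, not taken from the library; assumes A is an assembled
   MATSEQAIJCUSPARSE matrix):

     PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,MAT_CUSPARSE_ELL));

   or, equivalently, from the command line:

     -mat_cusparse_mult_storage_format ell
*/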

PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets whether MatSolve() is done on the CPU.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSPARSE LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This routine specifies whether the solve is done on the CPU or the GPU (the GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
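
/* Example usage (a sketch; assumes A is a MATSEQAIJCUSPARSE matrix that will be
   factored with PCLU/PCILU):

     PetscCall(MatCUSPARSESetUseCPUSolve(A,PETSC_TRUE));

   or from the command line:

     -mat_cusparse_use_cpu_solve
*/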

PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
    case MAT_FORM_EXPLICIT_TRANSPOSE:
      /* need to destroy the transpose matrix if present to prevent logic errors if flg is set to true later */
      if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      A->form_explicit_transpose = flg;
      break;
    default:
      PetscCall(MatSetOption_SeqAIJ(A,op,flg));
      break;
  }
  PetscFunctionReturn(0);
}
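
/* Example (a sketch): request an explicitly stored transpose to speed up repeated
   MatMultTranspose() calls; the handler above invalidates any cached transpose when
   the option is later turned off:

     PetscCall(MatSetOption(A,MAT_FORM_EXPLICIT_TRANSPOSE,PETSC_TRUE));
*/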

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
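
/* Example (a sketch; "./app" stands for any program that calls MatSetFromOptions() on a
   MATSEQAIJCUSPARSE matrix): the options above can be combined on the command line, e.g.

     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format hyb \
           -mat_cusparse_spmv_alg csrmv_alg1 -mat_cusparse_use_cpu_solve
*/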

static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }
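
        /* Illustration (an added note): for n=3 with strictly lower triangular entries
           (1,0),(2,0),(2,1), the loop above produces AiLo=[0,1,3,6], AjLo=[0, 0,1, 0,1,2],
           AALo=[1, l10,1, l20,l21,1], i.e. each row's L entries followed by a unit diagonal */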

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
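          /* note (added): MatLUFactorNumeric_SeqAIJ() stores the inverted pivot 1/d_i at the
             diagonal of the factored matrix, so the reciprocal taken here hands cuSPARSE the
             true diagonal d_i for its non-unit-diagonal triangular solve */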
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
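
  /* note (added): rpermIndices/cpermIndices are later consumed by MatSolve_SeqAIJCUSPARSE
     (through thrust permutation iterators) to reorder the right-hand side and the solution */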
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;
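
  /* note (added, to our understanding): the ICC/Cholesky factor is held in a MATSEQSBAIJ-style
     layout (upper triangle only); Mat_SeqAIJ and Mat_SeqSBAIJ share their leading fields, which
     is why A->data is cast both ways above */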

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
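
        /* note (clarification added): the "lower" factor reuses the upper-triangular storage
           built above and is solved as U^T x = b, which is why the fill mode is UPPER while
           the operation is TRANSPOSE */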

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    B->ops->matsolve = NULL;
    B->ops->matsolvetranspose = NULL;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
                                  #else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
952 
953   /* Create the solve analysis information */
954   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
955   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
956 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
957   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
958                                          loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
959                                          loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
960                                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
961                                          &loTriFactorT->solveBufferSize));
962   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
963 #endif
964 
965   /* perform the solve analysis */
966   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
967                                    loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
968                                    loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
969                                    loTriFactorT->csrMat->column_indices->data().get(),
970                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
971                                    loTriFactorT->solveInfo,
972                                    loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
973                                    #else
974                                    loTriFactorT->solveInfo));
975                                    #endif
976   PetscCallCUDA(WaitForCUDA());
977   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
978 
979   /* assign the pointer */
980   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
981 
982   /*********************************************/
983   /* Now the Transpose of the Upper Tri Factor */
984   /*********************************************/
985 
986   /* allocate space for the transpose of the upper triangular factor */
987   PetscCall(PetscNew(&upTriFactorT));
988   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
989 
990   /* set the matrix descriptors of the upper triangular factor */
991   matrixType = cusparseGetMatType(upTriFactor->descr);
992   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
993   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
994     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
995   diagType = cusparseGetMatDiagType(upTriFactor->descr);
996 
997   /* Create the matrix description */
998   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
999   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
1000   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
1001   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
1002   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1003 
1004   /* set the operation */
1005   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1006 
1007   /* allocate GPU space for the CSC of the upper triangular factor*/
1008   upTriFactorT->csrMat = new CsrMatrix;
1009   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1010   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1011   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1012   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1013   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1014   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1015 
1016   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1017 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1018   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1019                                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1020                                                upTriFactor->csrMat->values->data().get(),
1021                                                upTriFactor->csrMat->row_offsets->data().get(),
1022                                                upTriFactor->csrMat->column_indices->data().get(),
1023                                                upTriFactorT->csrMat->values->data().get(),
1024                                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1025                                                CUSPARSE_ACTION_NUMERIC,indexBase,
1026                                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
1027   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
1028 #endif
1029 
1030   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1031   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1032                                   upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1033                                   upTriFactor->csrMat->values->data().get(),
1034                                   upTriFactor->csrMat->row_offsets->data().get(),
1035                                   upTriFactor->csrMat->column_indices->data().get(),
1036                                   upTriFactorT->csrMat->values->data().get(),
1037                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1038                                   upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1039                                   CUSPARSE_ACTION_NUMERIC, indexBase,
1040                                   CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
1041                                   #else
1042                                   upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1043                                  CUSPARSE_ACTION_NUMERIC, indexBase));
1044                                  #endif
1045 
1046   PetscCallCUDA(WaitForCUDA());
1047   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1048 
1049   /* Create the solve analysis information */
1050   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1051   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1052   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1053   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1054                                          upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1055                                          upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1056                                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1057                                          &upTriFactorT->solveBufferSize));
1058   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
1059   #endif
1060 
1061   /* perform the solve analysis */
1062   /* TODO: this repeated buffer-size/analysis setup should be factored into a helper function */
1063   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1064                                    upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1065                                    upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1066                                    upTriFactorT->csrMat->column_indices->data().get(),
1067                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1068                                    upTriFactorT->solveInfo,
1069                                    upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1070                                    #else
1071                                    upTriFactorT->solveInfo));
1072                                    #endif
1073 
1074   PetscCallCUDA(WaitForCUDA());
1075   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1076 
1077   /* assign the pointer */
1078   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1079   PetscFunctionReturn(0);
1080 }
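/* For reference, the CUDA-11 conversion above follows cuSPARSE's usual two-phase pattern:
   query the workspace size, allocate, then convert. A condensed sketch with hypothetical
   placeholder names (handle, m, n, nnz, and the CSR arrays of A and of its transpose T):

     size_t bufferSize;
     void   *buffer;
     cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz, Aval, Arowptr, Acolidx,
                                   Tval, Trowptr, Tcolidx, cusparse_scalartype,
                                   CUSPARSE_ACTION_NUMERIC, indexBase,
                                   CUSPARSE_CSR2CSC_ALG1, &bufferSize);
     cudaMalloc(&buffer, bufferSize);
     cusparseCsr2cscEx2(handle, m, n, nnz, Aval, Arowptr, Acolidx,
                        Tval, Trowptr, Tcolidx, cusparse_scalartype,
                        CUSPARSE_ACTION_NUMERIC, indexBase,
                        CUSPARSE_CSR2CSC_ALG1, buffer);
     cudaFree(buffer);
*/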
1081 
1082 struct PetscScalarToPetscInt
1083 {
1084   __host__ __device__
1085   PetscInt operator()(PetscScalar s)
1086   {
1087     return (PetscInt)PetscRealPart(s);
1088   }
1089 };
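/* A minimal usage sketch of the functor above (hypothetical names), mirroring how it is used in
   MatSeqAIJCUSPARSEFormExplicitTranspose() below: seed an array with 0,1,2,... stored as PetscScalar,
   run it through csr2csc in place of the matrix values, then convert the permuted output back to
   integer indices:

     THRUSTARRAY    vals(nnz), tvals(nnz);        // PetscScalar arrays on device
     THRUSTINTARRAY perm(nnz);                    // PetscInt array on device
     thrust::sequence(thrust::device, vals.begin(), vals.end(), 0.0);
     // ... run cusparse csr2csc with vals as input values and tvals as output values ...
     thrust::transform(thrust::device, tvals.begin(), tvals.end(), perm.begin(), PetscScalarToPetscInt());
*/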
1090 
1091 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1092 {
1093   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1094   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1095   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1096   cusparseStatus_t             stat;
1097   cusparseIndexBase_t          indexBase;
1098 
1099   PetscFunctionBegin;
1100   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1101   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1102   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1103   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1104   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1105   if (A->transupdated) PetscFunctionReturn(0);
1106   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1107   PetscCall(PetscLogGpuTimeBegin());
1108   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1109     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1110   }
1111   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1112     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1113     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1114     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1115     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1116     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1117 
1118     /* set alpha and beta */
1119     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
1120     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
1121     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1122     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1123     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
1124     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1125 
1126     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1127       CsrMatrix *matrixT = new CsrMatrix;
1128       matstructT->mat = matrixT;
1129       matrixT->num_rows = A->cmap->n;
1130       matrixT->num_cols = A->rmap->n;
1131       matrixT->num_entries = a->nz;
1132       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1133       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1134       matrixT->values = new THRUSTARRAY(a->nz);
1135 
1136       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1137       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1138 
1139      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1140       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1141         stat = cusparseCreateCsr(&matstructT->matDescr,
1142                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1143                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1144                                matrixT->values->data().get(),
1145                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1146                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1147       #else
1148         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1149            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1150 
1151            It is unclear what a proper value for matstructT->matDescr would be with empty matrices, so I just set
1152            it to NULL so that any code relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1153            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so I also set it accordingly.
1154         */
1155         if (matrixT->num_entries) {
1156           stat = cusparseCreateCsr(&matstructT->matDescr,
1157                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1158                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1159                                  matrixT->values->data().get(),
1160                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1161                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1162 
1163         } else {
1164           matstructT->matDescr = NULL;
1165           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1166         }
1167       #endif
1168      #endif
1169     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1170    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1171       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1172    #else
1173       CsrMatrix *temp  = new CsrMatrix;
1174       CsrMatrix *tempT = new CsrMatrix;
1175       /* First convert HYB to CSR */
1176       temp->num_rows = A->rmap->n;
1177       temp->num_cols = A->cmap->n;
1178       temp->num_entries = a->nz;
1179       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1180       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1181       temp->values = new THRUSTARRAY(a->nz);
1182 
1183       stat = cusparse_hyb2csr(cusparsestruct->handle,
1184                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1185                               temp->values->data().get(),
1186                               temp->row_offsets->data().get(),
1187                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1188 
1189       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1190       tempT->num_rows = A->rmap->n;
1191       tempT->num_cols = A->cmap->n;
1192       tempT->num_entries = a->nz;
1193       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1194       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1195       tempT->values = new THRUSTARRAY(a->nz);
1196 
1197       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1198                               temp->num_cols, temp->num_entries,
1199                               temp->values->data().get(),
1200                               temp->row_offsets->data().get(),
1201                               temp->column_indices->data().get(),
1202                               tempT->values->data().get(),
1203                               tempT->column_indices->data().get(),
1204                               tempT->row_offsets->data().get(),
1205                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1206 
1207       /* Last, convert CSC to HYB */
1208       cusparseHybMat_t hybMat;
1209       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1210       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1211         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1212       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1213                               matstructT->descr, tempT->values->data().get(),
1214                               tempT->row_offsets->data().get(),
1215                               tempT->column_indices->data().get(),
1216                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1217 
1218       /* assign the pointer */
1219       matstructT->mat = hybMat;
1220       A->transupdated = PETSC_TRUE;
1221       /* delete temporaries */
1222       if (tempT) {
1223         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1224         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1225         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1226         delete (CsrMatrix*) tempT;
1227       }
1228       if (temp) {
1229         if (temp->values) delete (THRUSTARRAY*) temp->values;
1230         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1231         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1232         delete (CsrMatrix*) temp;
1233       }
1234      #endif
1235     }
1236   }
1237   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1238     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1239     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1240     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1241     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1242     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1243     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1244     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1245     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1246     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1247     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1248     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1249       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1250       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1251       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1252     }
1253     if (!cusparsestruct->csr2csc_i) {
1254       THRUSTARRAY csr2csc_a(matrix->num_entries);
1255       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1256 
1257       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1258      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259       void   *csr2cscBuffer;
1260       size_t csr2cscBufferSize;
1261       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1262                                            A->cmap->n, matrix->num_entries,
1263                                            matrix->values->data().get(),
1264                                            cusparsestruct->rowoffsets_gpu->data().get(),
1265                                            matrix->column_indices->data().get(),
1266                                            matrixT->values->data().get(),
1267                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1268                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1269                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
1270       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1271      #endif
1272 
1273       if (matrix->num_entries) {
1274         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1275            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1276            I checked every parameter and they were all fine. I have no clue why cusparse complains.
1277 
1278            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1279            should be filled with indexBase. So I just take a shortcut here.
1280         */
1281         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1282                               A->cmap->n,matrix->num_entries,
1283                               csr2csc_a.data().get(),
1284                               cusparsestruct->rowoffsets_gpu->data().get(),
1285                               matrix->column_indices->data().get(),
1286                               matrixT->values->data().get(),
1287                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1288                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1289                               CUSPARSE_ACTION_NUMERIC,indexBase,
1290                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1291                              #else
1292                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1293                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1294                              #endif
1295       } else {
1296         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1297       }
1298 
1299       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1300       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1301      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1302       PetscCallCUDA(cudaFree(csr2cscBuffer));
1303      #endif
1304     }
1305     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1306                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1307                                                      matrixT->values->begin()));
1308   }
1309   PetscCall(PetscLogGpuTimeEnd());
1310   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1311   /* the compressed row indices are not used for matTranspose */
1312   matstructT->cprowIndices = NULL;
1313   /* assign the pointer */
1314   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1315   A->transupdated = PETSC_TRUE;
1316   PetscFunctionReturn(0);
1317 }
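/* Once csr2csc_i is built, refreshing the transpose after A's values change reduces to the single
   device gather at the end of the routine above; conceptually (hypothetical names):

     // Tval[k] = Aval[csr2csc_i[k]] for all k in [0, nnz)
     thrust::copy(thrust::device,
                  thrust::make_permutation_iterator(Aval.begin(), perm.begin()),
                  thrust::make_permutation_iterator(Aval.begin(), perm.end()),
                  Tval.begin());

   so the expensive csr2csc conversion runs only once per sparsity pattern.
*/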
1318 
1319 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1320 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1321 {
1322   PetscInt                              n = xx->map->n;
1323   const PetscScalar                     *barray;
1324   PetscScalar                           *xarray;
1325   thrust::device_ptr<const PetscScalar> bGPU;
1326   thrust::device_ptr<PetscScalar>       xGPU;
1327   cusparseStatus_t                      stat;
1328   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1329   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1330   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1331   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1332 
1333   PetscFunctionBegin;
1334   /* Analyze the matrix and create the transpose ... on the fly */
1335   if (!loTriFactorT && !upTriFactorT) {
1336     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1337     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1338     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1339   }
1340 
1341   /* Get the GPU pointers */
1342   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1343   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1344   xGPU = thrust::device_pointer_cast(xarray);
1345   bGPU = thrust::device_pointer_cast(barray);
1346 
1347   PetscCall(PetscLogGpuTimeBegin());
1348   /* First, reorder with the row permutation */
1349   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1350                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1351                xGPU);
1352 
1353   /* Next, solve U */
1354   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1355                         upTriFactorT->csrMat->num_rows,
1356                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1357                         upTriFactorT->csrMat->num_entries,
1358                       #endif
1359                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1360                         upTriFactorT->csrMat->values->data().get(),
1361                         upTriFactorT->csrMat->row_offsets->data().get(),
1362                         upTriFactorT->csrMat->column_indices->data().get(),
1363                         upTriFactorT->solveInfo,
1364                         xarray,
1365                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1366                         tempGPU->data().get(),
1367                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1368                       #else
1369                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1370                       #endif
1371 
1372   /* Then, solve L */
1373   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1374                         loTriFactorT->csrMat->num_rows,
1375                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1376                         loTriFactorT->csrMat->num_entries,
1377                       #endif
1378                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1379                         loTriFactorT->csrMat->values->data().get(),
1380                         loTriFactorT->csrMat->row_offsets->data().get(),
1381                         loTriFactorT->csrMat->column_indices->data().get(),
1382                         loTriFactorT->solveInfo,
1383                         tempGPU->data().get(),
1384                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1385                         xarray,
1386                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1387                       #else
1388                          xarray);PetscCallCUSPARSE(stat);
1389                       #endif
1390 
1391   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1392   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1393                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1394                tempGPU->begin());
1395 
1396   /* Copy the temporary to the full solution. */
1397   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1398 
1399   /* restore */
1400   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1401   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1402   PetscCall(PetscLogGpuTimeEnd());
1403   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1404   PetscFunctionReturn(0);
1405 }
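/* Outline of the transposed solve above: since A = L U implies A^T = U^T L^T, the transposed factors
   are applied in the order U then L. The factors were transposed explicitly once in
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() (hence solveOp is NON_TRANSPOSE), presumably trading
   memory for per-solve speed. The steps, as coded above:

     1. w = b(rperm)       -- gather b with rpermIndices
     2. solve U^T y = w    -- upper factor, stored pre-transposed
     3. solve L^T z = y    -- lower factor, stored pre-transposed
     4. x = z(cperm)       -- gather with cpermIndices, staged through the work vector
*/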
1406 
1407 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1408 {
1409   const PetscScalar                 *barray;
1410   PetscScalar                       *xarray;
1411   cusparseStatus_t                  stat;
1412   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1413   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1414   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1415   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1416 
1417   PetscFunctionBegin;
1418   /* Analyze the matrix and create the transpose ... on the fly */
1419   if (!loTriFactorT && !upTriFactorT) {
1420     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1421     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1422     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1423   }
1424 
1425   /* Get the GPU pointers */
1426   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1427   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1428 
1429   PetscCall(PetscLogGpuTimeBegin());
1430   /* First, solve U */
1431   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1432                         upTriFactorT->csrMat->num_rows,
1433                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1434                         upTriFactorT->csrMat->num_entries,
1435                       #endif
1436                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1437                         upTriFactorT->csrMat->values->data().get(),
1438                         upTriFactorT->csrMat->row_offsets->data().get(),
1439                         upTriFactorT->csrMat->column_indices->data().get(),
1440                         upTriFactorT->solveInfo,
1441                         barray,
1442                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1443                         tempGPU->data().get(),
1444                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1445                       #else
1446                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1447                       #endif
1448 
1449   /* Then, solve L */
1450   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1451                         loTriFactorT->csrMat->num_rows,
1452                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1453                         loTriFactorT->csrMat->num_entries,
1454                       #endif
1455                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1456                         loTriFactorT->csrMat->values->data().get(),
1457                         loTriFactorT->csrMat->row_offsets->data().get(),
1458                         loTriFactorT->csrMat->column_indices->data().get(),
1459                         loTriFactorT->solveInfo,
1460                         tempGPU->data().get(),
1461                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1462                         xarray,
1463                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1464                       #else
1465                         xarray);PetscCallCUSPARSE(stat);
1466                       #endif
1467 
1468   /* restore */
1469   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1470   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1471   PetscCall(PetscLogGpuTimeEnd());
1472   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1473   PetscFunctionReturn(0);
1474 }
1475 
1476 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1477 {
1478   const PetscScalar                     *barray;
1479   PetscScalar                           *xarray;
1480   thrust::device_ptr<const PetscScalar> bGPU;
1481   thrust::device_ptr<PetscScalar>       xGPU;
1482   cusparseStatus_t                      stat;
1483   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1484   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1485   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1486   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1487 
1488   PetscFunctionBegin;
1489 
1490   /* Get the GPU pointers */
1491   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1492   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1493   xGPU = thrust::device_pointer_cast(xarray);
1494   bGPU = thrust::device_pointer_cast(barray);
1495 
1496   PetscCall(PetscLogGpuTimeBegin());
1497   /* First, reorder with the row permutation */
1498   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1499                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1500                tempGPU->begin());
1501 
1502   /* Next, solve L */
1503   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1504                         loTriFactor->csrMat->num_rows,
1505                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1506                         loTriFactor->csrMat->num_entries,
1507                       #endif
1508                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1509                         loTriFactor->csrMat->values->data().get(),
1510                         loTriFactor->csrMat->row_offsets->data().get(),
1511                         loTriFactor->csrMat->column_indices->data().get(),
1512                         loTriFactor->solveInfo,
1513                         tempGPU->data().get(),
1514                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1515                          xarray,
1516                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1517                       #else
1518                          xarray);PetscCallCUSPARSE(stat);
1519                       #endif
1520 
1521   /* Then, solve U */
1522   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1523                         upTriFactor->csrMat->num_rows,
1524                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1525                         upTriFactor->csrMat->num_entries,
1526                       #endif
1527                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1528                         upTriFactor->csrMat->values->data().get(),
1529                         upTriFactor->csrMat->row_offsets->data().get(),
1530                         upTriFactor->csrMat->column_indices->data().get(),
1531                         upTriFactor->solveInfo,xarray,
1532                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1533                         tempGPU->data().get(),
1534                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1535                       #else
1536                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1537                       #endif
1538 
1539   /* Last, reorder with the column permutation */
1540   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1541                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1542                xGPU);
1543 
1544   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1545   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1546   PetscCall(PetscLogGpuTimeEnd());
1547   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1548   PetscFunctionReturn(0);
1549 }
1550 
1551 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1552 {
1553   const PetscScalar                 *barray;
1554   PetscScalar                       *xarray;
1555   cusparseStatus_t                  stat;
1556   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1557   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1558   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1559   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1560 
1561   PetscFunctionBegin;
1562   /* Get the GPU pointers */
1563   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1564   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1565 
1566   PetscCall(PetscLogGpuTimeBegin());
1567   /* First, solve L */
1568   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1569                         loTriFactor->csrMat->num_rows,
1570                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1571                         loTriFactor->csrMat->num_entries,
1572                       #endif
1573                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1574                         loTriFactor->csrMat->values->data().get(),
1575                         loTriFactor->csrMat->row_offsets->data().get(),
1576                         loTriFactor->csrMat->column_indices->data().get(),
1577                         loTriFactor->solveInfo,
1578                         barray,
1579                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1580                         tempGPU->data().get(),
1581                         loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1582                       #else
1583                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1584                       #endif
1585 
1586   /* Next, solve U */
1587   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1588                         upTriFactor->csrMat->num_rows,
1589                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1590                         upTriFactor->csrMat->num_entries,
1591                       #endif
1592                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1593                         upTriFactor->csrMat->values->data().get(),
1594                         upTriFactor->csrMat->row_offsets->data().get(),
1595                         upTriFactor->csrMat->column_indices->data().get(),
1596                         upTriFactor->solveInfo,
1597                         tempGPU->data().get(),
1598                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1599                         xarray,
1600                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1601                       #else
1602                         xarray);PetscCallCUSPARSE(stat);
1603                       #endif
1604 
1605   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1606   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1607   PetscCall(PetscLogGpuTimeEnd());
1608   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1609   PetscFunctionReturn(0);
1610 }
1611 
1612 #if CUSPARSE_VERSION >= 11500
1613 /* cusparseSpSV_solve() and friends first appeared in the cuSPARSE shipped with CUDA 11.3, i.e., CUSPARSE_VERSION 11500 */
1614 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
1615 {
1616   Mat_SeqAIJCUSPARSETriFactors  *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
1617   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
1618   const PetscScalar             *barray;
1619   PetscScalar                   *xarray;
1620 
1621   PetscFunctionBegin;
1622   PetscCall(VecCUDAGetArrayWrite(x,&xarray));
1623   PetscCall(VecCUDAGetArrayRead(b,&barray));
1624   PetscCall(PetscLogGpuTimeBegin());
1625 
1626   /* Solve L*y = b */
1627   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
1628   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
1629   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
1630                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1631                     &PETSC_CUSPARSE_ONE,
1632                     fs->spMatDescr_L, /* L Y = X */
1633                     fs->dnVecDescr_X,
1634                     fs->dnVecDescr_Y,
1635                     cusparse_scalartype,
1636                     CUSPARSE_SPSV_ALG_DEFAULT,
1637                     fs->spsvDescr_L)); // cusparseSpSV_solve() silently reuses the external buffer provided to cusparseSpSV_analysis()!
1638 
1639   /* Solve U*x = y */
1640   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
1641   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
1642                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1643                     &PETSC_CUSPARSE_ONE,
1644                     fs->spMatDescr_U, /* U X = Y */
1645                     fs->dnVecDescr_Y,
1646                     fs->dnVecDescr_X,
1647                     cusparse_scalartype,
1648                     CUSPARSE_SPSV_ALG_DEFAULT,
1649                     fs->spsvDescr_U));
1650 
1651   PetscCall(VecCUDARestoreArrayRead(b,&barray));
1652   PetscCall(VecCUDARestoreArrayWrite(x,&xarray));
1653 
1654   PetscCall(PetscLogGpuTimeEnd());
1655   PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
1656   PetscFunctionReturn(0);
1657 }
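/* For reference, the generic cusparseSpSV workflow the ILU0 solve above relies on (the descriptors
   and buffers used here are created in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 below); a condensed
   sketch with hypothetical names:

     cusparseSpSV_createDescr(&spsvDescr);
     cusparseSpSV_bufferSize(handle, op, &alpha, matL, vecX, vecY, valueType,
                             CUSPARSE_SPSV_ALG_DEFAULT, spsvDescr, &bufferSize);
     cudaMalloc(&buffer, bufferSize);
     cusparseSpSV_analysis(handle, op, &alpha, matL, vecX, vecY, valueType,
                           CUSPARSE_SPSV_ALG_DEFAULT, spsvDescr, buffer);
     cusparseSpSV_solve(handle, op, &alpha, matL, vecX, vecY, valueType,
                        CUSPARSE_SPSV_ALG_DEFAULT, spsvDescr);   // buffer is reused internally
*/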
1658 
1659 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
1660 {
1661   Mat_SeqAIJCUSPARSETriFactors  *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
1662   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
1663   const PetscScalar             *barray;
1664   PetscScalar                   *xarray;
1665 
1666   PetscFunctionBegin;
1667   if (!fs->createdTransposeSpSVDescr) { /* first call to MatSolveTranspose(): create the transpose-solve descriptors */
1668     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1669     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
1670                       CUSPARSE_OPERATION_TRANSPOSE,
1671                       &PETSC_CUSPARSE_ONE,
1672                       fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1673                       fs->dnVecDescr_X,
1674                       fs->dnVecDescr_Y,
1675                       cusparse_scalartype,
1676                       CUSPARSE_SPSV_ALG_DEFAULT,
1677                       fs->spsvDescr_Lt,
1678                       &fs->spsvBufferSize_Lt));
1679 
1680     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1681     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
1682                       CUSPARSE_OPERATION_TRANSPOSE,
1683                       &PETSC_CUSPARSE_ONE,
1684                       fs->spMatDescr_U,
1685                       fs->dnVecDescr_X,
1686                       fs->dnVecDescr_Y,
1687                       cusparse_scalartype,
1688                       CUSPARSE_SPSV_ALG_DEFAULT,
1689                       fs->spsvDescr_Ut,
1690                       &fs->spsvBufferSize_Ut));
1691     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
1692     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut));
1693     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1694   }
1695 
1696   if (!fs->updatedTransposeSpSVAnalysis) {
1697     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
1698                     CUSPARSE_OPERATION_TRANSPOSE,
1699                     &PETSC_CUSPARSE_ONE,
1700                     fs->spMatDescr_L,
1701                     fs->dnVecDescr_X,
1702                     fs->dnVecDescr_Y,
1703                     cusparse_scalartype,
1704                     CUSPARSE_SPSV_ALG_DEFAULT,
1705                     fs->spsvDescr_Lt,
1706                     fs->spsvBuffer_Lt));
1707 
1708     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
1709                     CUSPARSE_OPERATION_TRANSPOSE,
1710                     &PETSC_CUSPARSE_ONE,
1711                     fs->spMatDescr_U,
1712                     fs->dnVecDescr_X,
1713                     fs->dnVecDescr_Y,
1714                     cusparse_scalartype,
1715                     CUSPARSE_SPSV_ALG_DEFAULT,
1716                     fs->spsvDescr_Ut,
1717                     fs->spsvBuffer_Ut));
1718     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1719   }
1720 
1721   PetscCall(VecCUDAGetArrayWrite(x,&xarray));
1722   PetscCall(VecCUDAGetArrayRead(b,&barray));
1723   PetscCall(PetscLogGpuTimeBegin());
1724 
1725   /* Solve Ut*y = b */
1726   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
1727   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
1728   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
1729                     CUSPARSE_OPERATION_TRANSPOSE,
1730                     &PETSC_CUSPARSE_ONE,
1731                     fs->spMatDescr_U, /* Ut Y = X */
1732                     fs->dnVecDescr_X,
1733                     fs->dnVecDescr_Y,
1734                     cusparse_scalartype,
1735                     CUSPARSE_SPSV_ALG_DEFAULT,
1736                     fs->spsvDescr_Ut));
1737 
1738   /* Solve Lt*x = y */
1739   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
1740   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
1741                     CUSPARSE_OPERATION_TRANSPOSE,
1742                     &PETSC_CUSPARSE_ONE,
1743                     fs->spMatDescr_L, /* Lt X = Y */
1744                     fs->dnVecDescr_Y,
1745                     fs->dnVecDescr_X,
1746                     cusparse_scalartype,
1747                     CUSPARSE_SPSV_ALG_DEFAULT,
1748                     fs->spsvDescr_Lt));
1749 
1750   PetscCall(VecCUDARestoreArrayRead(b,&barray));
1751   PetscCall(VecCUDARestoreArrayWrite(x,&xarray));
1752   PetscCall(PetscLogGpuTimeEnd());
1753   PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
1754   PetscFunctionReturn(0);
1755 }
1756 
1757 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info)
1758 {
1759   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
1760   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
1761   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1762   CsrMatrix                    *Acsr;
1763   PetscInt                     m,nz;
1764   PetscBool                    flg;
1765 
1766   PetscFunctionBegin;
1767   if (PetscDefined(USE_DEBUG)) {
1768     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
1769     PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
1770   }
1771 
1772   /* Copy A's value to fact */
1773   m  = fact->rmap->n;
1774   nz = aij->nz;
1775   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1776   Acsr = (CsrMatrix*)Acusp->mat->mat;
1777   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
1778 
1779   /* Factorize fact inplace */
1780   if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1781                     fs->matDescr_M,
1782                     fs->csrVal,
1783                     fs->csrRowPtr,
1784                     fs->csrColIdx,
1785                     fs->ilu0Info_M,
1786                     fs->policy_M,
1787                     fs->factBuffer_M));
1788   if (PetscDefined(USE_DEBUG)) {
1789     int              numerical_zero;
1790     cusparseStatus_t status;
1791     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1792     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero);
1793   }
1794 
1795   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, so we do it after cusparseXcsrilu02().
1796      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
1797   */
1798   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
1799                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1800                     &PETSC_CUSPARSE_ONE,
1801                     fs->spMatDescr_L,
1802                     fs->dnVecDescr_X,
1803                     fs->dnVecDescr_Y,
1804                     cusparse_scalartype,
1805                     CUSPARSE_SPSV_ALG_DEFAULT,
1806                     fs->spsvDescr_L,
1807                     fs->spsvBuffer_L));
1808 
1809   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
1810                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1811                     &PETSC_CUSPARSE_ONE,
1812                     fs->spMatDescr_U,
1813                     fs->dnVecDescr_X,
1814                     fs->dnVecDescr_Y,
1815                     cusparse_scalartype,
1816                     CUSPARSE_SPSV_ALG_DEFAULT,
1817                     fs->spsvDescr_U,
1818                     fs->spsvBuffer_U));
1819 
1820   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
1821   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1822 
1823   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1824   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
1825   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1826   fact->ops->matsolve          = NULL;
1827   fact->ops->matsolvetranspose = NULL;
1828   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1829   PetscFunctionReturn(0);
1830 }
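/* The csrilu02 calls above and in the symbolic routine below follow cuSPARSE's standard
   incomplete-factorization pipeline; a condensed sketch (hypothetical names):

     cusparseCreateCsrilu02Info(&info);
     cusparseXcsrilu02_bufferSize(handle, m, nz, descrM, val, rowPtr, colIdx, info, &bufSize);
     cudaMalloc(&buf, bufSize);
     cusparseXcsrilu02_analysis(handle, m, nz, descrM, val, rowPtr, colIdx, info, policy, buf);
     cusparseXcsrilu02_zeroPivot(handle, info, &structuralZero);  // check the pattern
     cusparseXcsrilu02(handle, m, nz, descrM, val, rowPtr, colIdx, info, policy, buf); // numeric phase, in place
     cusparseXcsrilu02_zeroPivot(handle, info, &numericalZero);   // check the values

   The analysis and buffer allocation live in the symbolic phase, so only the numeric phase reruns
   when A's values change.
*/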
1831 
1832 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1833 {
1834   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
1835   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
1836   PetscInt                     m,nz;
1837 
1838   PetscFunctionBegin;
1839   if (PetscDefined(USE_DEBUG)) {
1840     PetscInt  i;
1841     PetscBool flg,missing;
1842 
1843     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
1844     PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
1845     PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
1846     PetscCall(MatMissingDiagonal(A,&missing,&i));
1847     PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
1848   }
1849 
1850   /* Free the old stale stuff */
1851   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1852 
1853   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host;
1854      they will not be used, but having them makes debugging easier.
1855    */
1856   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));
1857 
1858   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1859   fact->factortype             = MAT_FACTOR_ILU;
1860   fact->info.factor_mallocs    = 0;
1861   fact->info.fill_ratio_given  = info->fill;
1862   fact->info.fill_ratio_needed = 1.0;
1863 
1864   aij->row = NULL;
1865   aij->col = NULL;
1866 
1867   /* ====================================================================== */
1868   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1869   /* We'll do in-place factorization on fact                                */
1870   /* ====================================================================== */
1871   const int *Ai,*Aj;
1872 
1873   m  = fact->rmap->n;
1874   nz = aij->nz;
1875 
1876   PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
1877   PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
1878   PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
1879   PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
1880   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
1881   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
1882 
1883   /* ====================================================================== */
1884   /* Create descriptors for M, L, U                                         */
1885   /* ====================================================================== */
1886   cusparseFillMode_t fillMode;
1887   cusparseDiagType_t diagType;
1888 
1889   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1890   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1891   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1892 
1893   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1894     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1895     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1896     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1897     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1898   */
1899   fillMode = CUSPARSE_FILL_MODE_LOWER;
1900   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1901   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
1902                     fs->csrRowPtr,
1903                     fs->csrColIdx,
1904                     fs->csrVal,
1905                     CUSPARSE_INDEX_32I,
1906                     CUSPARSE_INDEX_32I,
1907                     CUSPARSE_INDEX_BASE_ZERO,
1908                     cusparse_scalartype));
1909   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
1910                     CUSPARSE_SPMAT_FILL_MODE,
1911                     &fillMode,
1912                     sizeof(fillMode)));
1913   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
1914                     CUSPARSE_SPMAT_DIAG_TYPE,
1915                     &diagType,
1916                     sizeof(diagType)));
1917 
1918   fillMode = CUSPARSE_FILL_MODE_UPPER;
1919   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1920   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz,
1921                     fs->csrRowPtr,
1922                     fs->csrColIdx,
1923                     fs->csrVal,
1924                     CUSPARSE_INDEX_32I,
1925                     CUSPARSE_INDEX_32I,
1926                     CUSPARSE_INDEX_BASE_ZERO,
1927                     cusparse_scalartype));
1928   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
1929                     CUSPARSE_SPMAT_FILL_MODE,
1930                     &fillMode,
1931                     sizeof(fillMode)));
1932   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
1933                     CUSPARSE_SPMAT_DIAG_TYPE,
1934                     &diagType,
1935                     sizeof(diagType)));
1936 
1937   /* ========================================================================= */
1938   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1939   /* ========================================================================= */
1940   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1941   if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1942                     fs->matDescr_M,
1943                     fs->csrVal,
1944                     fs->csrRowPtr,
1945                     fs->csrColIdx,
1946                     fs->ilu0Info_M,
1947                     &fs->factBufferSize_M));
1948 
1949   PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
1950   PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));
1951 
1952   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
1953   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));
1954 
1955   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1956   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
1957                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1958                     &PETSC_CUSPARSE_ONE,
1959                     fs->spMatDescr_L,
1960                     fs->dnVecDescr_X,
1961                     fs->dnVecDescr_Y,
1962                     cusparse_scalartype,
1963                     CUSPARSE_SPSV_ALG_DEFAULT,
1964                     fs->spsvDescr_L,
1965                     &fs->spsvBufferSize_L));
1966 
1967   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1968   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
1969                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1970                     &PETSC_CUSPARSE_ONE,
1971                     fs->spMatDescr_U,
1972                     fs->dnVecDescr_X,
1973                     fs->dnVecDescr_Y,
1974                     cusparse_scalartype,
1975                     CUSPARSE_SPSV_ALG_DEFAULT,
1976                     fs->spsvDescr_U,
1977                     &fs->spsvBufferSize_U));
1978 
1979   /* From my experiments with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
1980      and the discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
1981      spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can be shared with either of them.
1982      To save memory, we let factBuffer_M share storage with the bigger of spsvBuffer_L/U.
1983    */
1984   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
1985     PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
1986     fs->spsvBuffer_L = fs->factBuffer_M;
1987     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U));
1988   } else {
1989     PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_U,(size_t)fs->factBufferSize_M)));
1990     fs->spsvBuffer_U = fs->factBuffer_M;
1991     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
1992   }
1993 
1994   /* ========================================================================== */
1995   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1996   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1997   /* ========================================================================== */
1998   int              structural_zero;
1999   cusparseStatus_t status;
2000 
2001   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2002   if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
2003                     fs->matDescr_M,
2004                     fs->csrVal,
2005                     fs->csrRowPtr,
2006                     fs->csrColIdx,
2007                     fs->ilu0Info_M,
2008                     fs->policy_M,
2009                     fs->factBuffer_M));
2010   if (PetscDefined(USE_DEBUG)) {
2011     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2012     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
2013     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero);
2014   }
2015 
2016   /* Estimate FLOPs of the numeric factorization */
2017   {
2018     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
2019     PetscInt       *Ai,*Adiag,nzRow,nzLeft;
2020     PetscLogDouble flops = 0.0;
2021 
2022     PetscCall(MatMarkDiagonal_SeqAIJ(A));
2023     Ai    = Aseq->i;
2024     Adiag = Aseq->diag;
2025     for (PetscInt i=0; i<m; i++) {
2026       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */
2027         nzRow  = Ai[i+1] - Ai[i];
2028         nzLeft = Adiag[i] - Ai[i];
2029         /* We eliminate the nonzeros left of the diagonal one by one. Assume each elimination updates the nonzeros
2030           to the right of (and including) the eliminated one, incurring one multiplication and one addition each.
2031           As a heuristic, take nzLeft to be about half the row (see the worked example after this function). */
2032         nzLeft = (nzRow-1)/2;
2033         flops += nzLeft*(2.0*nzRow-nzLeft+1);
2034       }
2035     }
2036     fs->numericFactFlops = flops;
2037   }
2038   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
2039   PetscFunctionReturn(0);
2040 }
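/* Worked instance of the FLOPs estimate above: for a row with nzRow = 5 nonzeros and at least one
   entry left of the diagonal, the heuristic takes nzLeft = (5-1)/2 = 2 and adds

     nzLeft*(2.0*nzRow - nzLeft + 1) = 2*(10 - 2 + 1) = 18

   flops for that row, i.e., roughly one multiply and one add per updated entry per elimination. */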
2041 
2042 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)
2043 {
2044   Mat_SeqAIJCUSPARSETriFactors  *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
2045   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
2046   const PetscScalar             *barray;
2047   PetscScalar                   *xarray;
2048 
2049   PetscFunctionBegin;
2050   PetscCall(VecCUDAGetArrayWrite(x,&xarray));
2051   PetscCall(VecCUDAGetArrayRead(b,&barray));
2052   PetscCall(PetscLogGpuTimeBegin());
2053 
2054   /* Solve L*y = b */
2055   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
2056   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
2057   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
2058                     CUSPARSE_OPERATION_NON_TRANSPOSE,
2059                     &PETSC_CUSPARSE_ONE,
2060                     fs->spMatDescr_L, /* L Y = X */
2061                     fs->dnVecDescr_X,
2062                     fs->dnVecDescr_Y,
2063                     cusparse_scalartype,
2064                     CUSPARSE_SPSV_ALG_DEFAULT,
2065                     fs->spsvDescr_L));
2066 
2067   /* Solve Lt*x = y */
2068   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
2069   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
2070                     CUSPARSE_OPERATION_TRANSPOSE,
2071                     &PETSC_CUSPARSE_ONE,
2072                     fs->spMatDescr_L, /* Lt X = Y */
2073                     fs->dnVecDescr_Y,
2074                     fs->dnVecDescr_X,
2075                     cusparse_scalartype,
2076                     CUSPARSE_SPSV_ALG_DEFAULT,
2077                     fs->spsvDescr_Lt));
2078 
2079   PetscCall(VecCUDARestoreArrayRead(b,&barray));
2080   PetscCall(VecCUDARestoreArrayWrite(x,&xarray));
2081 
2082   PetscCall(PetscLogGpuTimeEnd());
2083   PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
2084   PetscFunctionReturn(0);
2085 }
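/* A minimal usage sketch (illustrative only, not part of the library source): the ICC(0) device path
   above is reached through the standard KSP/PC interface with the "cusparse" solver type, zero levels
   of fill and the natural (identity) ordering, e.g.

     KSP ksp;
     PC  pc;
     PetscCall(KSPCreate(PETSC_COMM_SELF,&ksp));
     PetscCall(KSPSetOperators(ksp,A,A));                        // A is a MATSEQAIJCUSPARSE matrix
     PetscCall(KSPGetPC(ksp,&pc));
     PetscCall(PCSetType(pc,PCICC));
     PetscCall(PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE));
     PetscCall(PCFactorSetLevels(pc,0));                         // ICC(0) is required for this path
     PetscCall(KSPSolve(ksp,b,x));
*/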
2086 
2087 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info)
2088 {
2089   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
2090   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
2091   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2092   CsrMatrix                    *Acsr;
2093   PetscInt                     m,nz;
2094   PetscBool                    flg;
2095 
2096   PetscFunctionBegin;
2097   if (PetscDefined(USE_DEBUG)) {
2098     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2099     PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
2100   }
2101 
2102   /* Copy A's values to fact */
2103   m  = fact->rmap->n;
2104   nz = aij->nz;
2105   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2106   Acsr = (CsrMatrix*)Acusp->mat->mat;
2107   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
2108 
2109   /* Factorize fact inplace */
2110   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
2111      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
2112      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
2113      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
2114      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
2115    */
2116   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz,
2117                     fs->matDescr_M,
2118                     fs->csrVal,
2119                     fs->csrRowPtr,
2120                     fs->csrColIdx,
2121                     fs->ic0Info_M,
2122                     fs->policy_M,
2123                     fs->factBuffer_M));
2124   if (PetscDefined(USE_DEBUG)) {
2125     int              numerical_zero;
2126     cusparseStatus_t status;
2127     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
2128     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero);
2129   }
2130 
2131   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
2132                     CUSPARSE_OPERATION_NON_TRANSPOSE,
2133                     &PETSC_CUSPARSE_ONE,
2134                     fs->spMatDescr_L,
2135                     fs->dnVecDescr_X,
2136                     fs->dnVecDescr_Y,
2137                     cusparse_scalartype,
2138                     CUSPARSE_SPSV_ALG_DEFAULT,
2139                     fs->spsvDescr_L,
2140                     fs->spsvBuffer_L));
2141 
2142   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
2143     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
2144   */
2145   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
2146                     CUSPARSE_OPERATION_TRANSPOSE,
2147                     &PETSC_CUSPARSE_ONE,
2148                     fs->spMatDescr_L,
2149                     fs->dnVecDescr_X,
2150                     fs->dnVecDescr_Y,
2151                     cusparse_scalartype,
2152                     CUSPARSE_SPSV_ALG_DEFAULT,
2153                     fs->spsvDescr_Lt,
2154                     fs->spsvBuffer_Lt));
2155 
2156   fact->offloadmask            = PETSC_OFFLOAD_GPU;
2157   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
2158   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
2159   fact->ops->matsolve          = NULL;
2160   fact->ops->matsolvetranspose = NULL;
2161   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2162   PetscFunctionReturn(0);
2163 }
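/* Both solve and solvetranspose are set to MatSolve_SeqAIJCUSPARSE_ICC0 above: the factored matrix
   M = L*Lt is symmetric, so M^T x = b is solved by exactly the same pair of triangular solves. */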
2164 
2165 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2166 {
2167   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
2168   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
2169   PetscInt                     m,nz;
2170 
2171   PetscFunctionBegin;
2172   if (PetscDefined(USE_DEBUG)) {
2173     PetscInt  i;
2174     PetscBool flg,missing;
2175 
2176     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2177     PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
2178     PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
2179     PetscCall(MatMissingDiagonal(A,&missing,&i));
2180     PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
2181   }
2182 
2183   /* Free the old stale stuff */
2184   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2185 
2186   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host,
2187      even though they will not be used; we allocate them just for easy debugging.
2188    */
2189   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));
2190 
2191   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2192   fact->factortype             = MAT_FACTOR_ICC;
2193   fact->info.factor_mallocs    = 0;
2194   fact->info.fill_ratio_given  = info->fill;
2195   fact->info.fill_ratio_needed = 1.0;
2196 
2197   aij->row = NULL;
2198   aij->col = NULL;
2199 
2200   /* ====================================================================== */
2201   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2202   /* We'll do in-place factorization on fact                                */
2203   /* ====================================================================== */
2204   const int *Ai,*Aj;
2205 
2206   m  = fact->rmap->n;
2207   nz = aij->nz;
2208 
2209   PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
2210   PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
2211   PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
2212   PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
2213   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
2214   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
2215 
2216   /* ====================================================================== */
2217   /* Create mat descriptors for M, L                                        */
2218   /* ====================================================================== */
2219   cusparseFillMode_t fillMode;
2220   cusparseDiagType_t diagType;
2221 
2222   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2223   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2224   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2225 
2226   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2227     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2228     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2229     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2230     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2231   */
2232   fillMode = CUSPARSE_FILL_MODE_LOWER;
2233   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2234   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
2235                     fs->csrRowPtr,
2236                     fs->csrColIdx,
2237                     fs->csrVal,
2238                     CUSPARSE_INDEX_32I,
2239                     CUSPARSE_INDEX_32I,
2240                     CUSPARSE_INDEX_BASE_ZERO,
2241                     cusparse_scalartype));
2242   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
2243                     CUSPARSE_SPMAT_FILL_MODE,
2244                     &fillMode,
2245                     sizeof(fillMode)));
2246   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
2247                     CUSPARSE_SPMAT_DIAG_TYPE,
2248                     &diagType,
2249                     sizeof(diagType)));
2250 
2251   /* ========================================================================= */
2252   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2253   /* ========================================================================= */
2254   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2255   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz,
2256                     fs->matDescr_M,
2257                     fs->csrVal,
2258                     fs->csrRowPtr,
2259                     fs->csrColIdx,
2260                     fs->ic0Info_M,
2261                     &fs->factBufferSize_M));
2262 
2263   PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
2264   PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));
2265 
2266   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
2267   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));
2268 
2269   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2270   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
2271                     CUSPARSE_OPERATION_NON_TRANSPOSE,
2272                     &PETSC_CUSPARSE_ONE,
2273                     fs->spMatDescr_L,
2274                     fs->dnVecDescr_X,
2275                     fs->dnVecDescr_Y,
2276                     cusparse_scalartype,
2277                     CUSPARSE_SPSV_ALG_DEFAULT,
2278                     fs->spsvDescr_L,
2279                     &fs->spsvBufferSize_L));
2280 
2281   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2282   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
2283                     CUSPARSE_OPERATION_TRANSPOSE,
2284                     &PETSC_CUSPARSE_ONE,
2285                     fs->spMatDescr_L,
2286                     fs->dnVecDescr_X,
2287                     fs->dnVecDescr_Y,
2288                     cusparse_scalartype,
2289                     CUSPARSE_SPSV_ALG_DEFAULT,
2290                     fs->spsvDescr_Lt,
2291                     &fs->spsvBufferSize_Lt));
2292 
2293   /* To save device memory, we let the factorization buffer share storage with the larger of the two solver buffers.
2294      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
2295    */
2296   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
2297     PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
2298     fs->spsvBuffer_L = fs->factBuffer_M;
2299     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
2300   } else {
2301     PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_Lt,(size_t)fs->factBufferSize_M)));
2302     fs->spsvBuffer_Lt = fs->factBuffer_M;
2303     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
2304   }
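  /* Illustrative sizes for the sharing above: with factBufferSize_M = 10 MB, spsvBufferSize_L = 8 MB
     and spsvBufferSize_Lt = 6 MB, we allocate max(8,10) = 10 MB shared by the factorization and the
     L-solve plus a separate 6 MB buffer for the Lt-solve, i.e. 16 MB instead of 24 MB overall. */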
2305 
2306   /* ========================================================================== */
2307   /* Perform analysis of ic0 on M                                               */
2308   /* The lower triangular part of M has the same sparsity pattern as L          */
2309   /* ========================================================================== */
2310   int              structural_zero;
2311   cusparseStatus_t status;
2312 
2313   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2314   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz,
2315                     fs->matDescr_M,
2316                     fs->csrVal,
2317                     fs->csrRowPtr,
2318                     fs->csrColIdx,
2319                     fs->ic0Info_M,
2320                     fs->policy_M,
2321                     fs->factBuffer_M));
2322   if (PetscDefined(USE_DEBUG)) {
2323     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2324     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2325     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero);
2326   }
2327 
2328   /* Estimate FLOPs of the numeric factorization */
2329   {
2330     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
2331     PetscInt       *Ai,nzRow,nzLeft;
2332     PetscLogDouble flops = 0.0;
2333 
2334     Ai = Aseq->i;
2335     for (PetscInt i=0; i<m; i++) {
2336       nzRow = Ai[i+1] - Ai[i];
2337       if (nzRow > 1) {
2338         /* We eliminate the nonzeros left of the diagonal one by one; each elimination updates the entries
2339           at and to the right of the eliminated one, one multiplication and one addition per entry. With a
2340           symmetric pattern, about half of the row's nonzeros, (nzRow-1)/2, lie left of the diagonal. */
2341         nzLeft = (nzRow-1)/2;
2342         flops += nzLeft*(2.0*nzRow-nzLeft+1);
2343       }
2344     }
2345     fs->numericFactFlops = flops;
2346   }
2347   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2348   PetscFunctionReturn(0);
2349 }
2350 #endif
2351 
2352 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2353 {
2354   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2355 
2356   PetscFunctionBegin;
2357  #if CUSPARSE_VERSION >= 11500
2358   PetscBool row_identity,col_identity;
2359   PetscCall(ISIdentity(isrow,&row_identity));
2360   PetscCall(ISIdentity(iscol,&col_identity));
2361   if (!info->levels && row_identity && col_identity) {
2362     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info));
2363   } else
2364  #endif
2365   {
2366     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2367     PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
2368     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2369   }
2370   PetscFunctionReturn(0);
2371 }
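/* Illustrative options (assuming the operator is MATSEQAIJCUSPARSE): the device ILU(0) branch above is
   selected with
     -pc_type ilu -pc_factor_mat_solver_type cusparse -pc_factor_levels 0
   because the preferred natural ordering makes isrow/iscol identity index sets; any fill level > 0 or a
   non-identity ordering falls back to the host symbolic factorization MatILUFactorSymbolic_SeqAIJ(). */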
2372 
2373 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2374 {
2375   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2376 
2377   PetscFunctionBegin;
2378   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2379   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
2380   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2381   PetscFunctionReturn(0);
2382 }
2383 
2384 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2385 {
2386   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2387 
2388   PetscFunctionBegin;
2389  #if CUSPARSE_VERSION >= 11500
2390   PetscBool perm_identity;
2391   PetscCall(ISIdentity(perm,&perm_identity));
2392   if (!info->levels && perm_identity) {
2393     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info));
2394   } else
2395  #endif
2396   {
2397     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2398     PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
2399     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2400   }
2401   PetscFunctionReturn(0);
2402 }
2403 
2404 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2405 {
2406   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2407 
2408   PetscFunctionBegin;
2409   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2410   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
2411   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2412   PetscFunctionReturn(0);
2413 }
2414 
2415 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
2416 {
2417   PetscFunctionBegin;
2418   *type = MATSOLVERCUSPARSE;
2419   PetscFunctionReturn(0);
2420 }
2421 
2422 /*MC
2423   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2424   on a single GPU of type seqaijcusparse or aijcusparse. The currently supported
2425   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2426   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
2427   CUSPARSE triangular solve algorithm. However, the performance can be quite poor, so these
2428   algorithms are not recommended. This class does NOT support direct solver operations.
2429 
2430   Level: beginner
2431 
2432 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2433 M*/
2434 
2435 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
2436 {
2437   PetscInt       n = A->rmap->n;
2438 
2439   PetscFunctionBegin;
2440   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
2441   PetscCall(MatSetSizes(*B,n,n,n,n));
2442   (*B)->factortype = ftype;
2443   PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));
2444 
2445   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
2446   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2447     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
2448     if (!A->boundtocpu) {
2449       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2450       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2451     } else {
2452       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2453       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2454     }
2455     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
2456     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2457     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2458   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2459     if (!A->boundtocpu) {
2460       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2461       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2462     } else {
2463       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2464       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2465     }
2466     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2467     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2468   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
2469 
2470   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
2471   (*B)->canuseordering = PETSC_TRUE;
2472   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
2473   PetscFunctionReturn(0);
2474 }
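/* A sketch of driving this factory directly through the Mat interface (illustrative only; b and x are
   Vecs, isrow/iscol come from MatGetOrdering()):

     Mat           F;
     MatFactorInfo info;
     PetscCall(MatFactorInfoInitialize(&info));
     PetscCall(MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F));
     PetscCall(MatILUFactorSymbolic(F,A,isrow,iscol,&info));
     PetscCall(MatLUFactorNumeric(F,A,&info));
     PetscCall(MatSolve(F,b,x));
     PetscCall(MatDestroy(&F));
*/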
2475 
2476 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2477 {
2478   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2479   Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2480  #if CUSPARSE_VERSION >= 11500
2481   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
2482  #endif
2483 
2484   PetscFunctionBegin;
2485   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2486     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
2487     if (A->factortype == MAT_FACTOR_NONE) {
2488       CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
2489       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2490     }
2491    #if CUSPARSE_VERSION >= 11500
2492     else if (fs->csrVal) {
2493       /* We have a factorized matrix on device and are able to copy it to host */
2494       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2495     }
2496    #endif
2497     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host");
2498     PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
2499     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
2500     A->offloadmask = PETSC_OFFLOAD_BOTH;
2501   }
2502   PetscFunctionReturn(0);
2503 }
2504 
2505 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
2506 {
2507   PetscFunctionBegin;
2508   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2509   *array = ((Mat_SeqAIJ*)A->data)->a;
2510   PetscFunctionReturn(0);
2511 }
2512 
2513 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
2514 {
2515   PetscFunctionBegin;
2516   A->offloadmask = PETSC_OFFLOAD_CPU;
2517   *array         = NULL;
2518   PetscFunctionReturn(0);
2519 }
2520 
2521 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
2522 {
2523   PetscFunctionBegin;
2524   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2525   *array = ((Mat_SeqAIJ*)A->data)->a;
2526   PetscFunctionReturn(0);
2527 }
2528 
2529 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
2530 {
2531   PetscFunctionBegin;
2532   *array = NULL;
2533   PetscFunctionReturn(0);
2534 }
2535 
2536 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
2537 {
2538   PetscFunctionBegin;
2539   *array = ((Mat_SeqAIJ*)A->data)->a;
2540   PetscFunctionReturn(0);
2541 }
2542 
2543 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
2544 {
2545   PetscFunctionBegin;
2546   A->offloadmask = PETSC_OFFLOAD_CPU;
2547   *array         = NULL;
2548   PetscFunctionReturn(0);
2549 }
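/* The accessors above implement the host-array protocol for this GPU type: read access triggers a
   device-to-host copy and leaves the offload mask at PETSC_OFFLOAD_BOTH, while write access marks the
   host copy authoritative (PETSC_OFFLOAD_CPU) so the next device operation re-uploads the values.
   An illustrative caller (not from this file):

     PetscScalar *vals;
     PetscCall(MatSeqAIJGetArray(A,&vals));     // syncs device -> host if needed
     vals[0] *= 2.0;                            // modify on the host
     PetscCall(MatSeqAIJRestoreArray(A,&vals)); // invalidates the device copy
*/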
2550 
2551 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
2552 {
2553   Mat_SeqAIJCUSPARSE           *cusp;
2554   CsrMatrix                    *matrix;
2555 
2556   PetscFunctionBegin;
2557   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2558   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2559   cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
2560   PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
2561   matrix = (CsrMatrix*)cusp->mat->mat;
2562 
2563   if (i) {
2564    #if !defined(PETSC_USE_64BIT_INDICES)
2565     *i = matrix->row_offsets->data().get();
2566    #else
2567     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE does not support 64-bit indices");
2568    #endif
2569   }
2570   if (j) {
2571    #if !defined(PETSC_USE_64BIT_INDICES)
2572     *j = matrix->column_indices->data().get();
2573    #else
2574     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE does not support 64-bit indices");
2575    #endif
2576   }
2577   if (a) *a = matrix->values->data().get();
2578   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2579   PetscFunctionReturn(0);
2580 }
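/* Illustrative use of the accessor above: a caller can obtain the raw device CSR arrays without a copy
   and hand them to its own kernel (launch_my_kernel is a hypothetical placeholder):

     const PetscInt *i,*j;
     PetscScalar    *a;
     PetscMemType   mtype;
     PetscCall(MatSeqAIJGetCSRAndMemType(A,&i,&j,&a,&mtype));
     if (PetscMemTypeDevice(mtype)) launch_my_kernel(i,j,a); // pointers are in GPU memory
*/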
2581 
2582 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2583 {
2584   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2585   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2586   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2587   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
2588   cusparseStatus_t             stat;
2589   PetscBool                    both = PETSC_TRUE;
2590 
2591   PetscFunctionBegin;
2592   PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
2593   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2594     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2595       CsrMatrix *matrix;
2596       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
2597 
2598       PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
2599       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
2600       matrix->values->assign(a->a, a->a+a->nz);
2601       PetscCallCUDA(WaitForCUDA());
2602       PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
2603       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
2604       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
2605     } else {
2606       PetscInt nnz;
2607       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
2608       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
2609       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
2610       delete cusparsestruct->workVector;
2611       delete cusparsestruct->rowoffsets_gpu;
2612       cusparsestruct->workVector = NULL;
2613       cusparsestruct->rowoffsets_gpu = NULL;
2614       try {
2615         if (a->compressedrow.use) {
2616           m    = a->compressedrow.nrows;
2617           ii   = a->compressedrow.i;
2618           ridx = a->compressedrow.rindex;
2619         } else {
2620           m    = A->rmap->n;
2621           ii   = a->i;
2622           ridx = NULL;
2623         }
2624         PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
2625         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
2626         else nnz = a->nz;
2627         PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
2628 
2629         /* create cusparse matrix */
2630         cusparsestruct->nrows = m;
2631         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2632         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2633         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2634         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2635 
2636         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
2637         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
2638         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2639         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2640         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
2641         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2642         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2643 
2644         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2645         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
2646           /* set the matrix */
2647           CsrMatrix *mat = new CsrMatrix;
2648           mat->num_rows = m;
2649           mat->num_cols = A->cmap->n;
2650           mat->num_entries = nnz;
2651           mat->row_offsets = new THRUSTINTARRAY32(m+1);
2652           mat->row_offsets->assign(ii, ii + m+1);
2653 
2654           mat->column_indices = new THRUSTINTARRAY32(nnz);
2655           mat->column_indices->assign(a->j, a->j+nnz);
2656 
2657           mat->values = new THRUSTARRAY(nnz);
2658           if (a->a) mat->values->assign(a->a, a->a+nnz);
2659 
2660           /* assign the pointer */
2661           matstruct->mat = mat;
2662          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2663           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2664             stat = cusparseCreateCsr(&matstruct->matDescr,
2665                                     mat->num_rows, mat->num_cols, mat->num_entries,
2666                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
2667                                     mat->values->data().get(),
2668                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2669                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
2670           }
2671          #endif
2672         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
2673          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2674           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2675          #else
2676           CsrMatrix *mat = new CsrMatrix;
2677           mat->num_rows = m;
2678           mat->num_cols = A->cmap->n;
2679           mat->num_entries = nnz;
2680           mat->row_offsets = new THRUSTINTARRAY32(m+1);
2681           mat->row_offsets->assign(ii, ii + m+1);
2682 
2683           mat->column_indices = new THRUSTINTARRAY32(nnz);
2684           mat->column_indices->assign(a->j, a->j+nnz);
2685 
2686           mat->values = new THRUSTARRAY(nnz);
2687           if (a->a) mat->values->assign(a->a, a->a+nnz);
2688 
2689           cusparseHybMat_t hybMat;
2690           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2691           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
2692             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2693           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
2694               matstruct->descr, mat->values->data().get(),
2695               mat->row_offsets->data().get(),
2696               mat->column_indices->data().get(),
2697               hybMat, 0, partition);PetscCallCUSPARSE(stat);
2698           /* assign the pointer */
2699           matstruct->mat = hybMat;
2700 
2701           if (mat) {
2702             if (mat->values) delete (THRUSTARRAY*)mat->values;
2703             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
2704             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
2705             delete (CsrMatrix*)mat;
2706           }
2707          #endif
2708         }
2709 
2710         /* assign the compressed row indices */
2711         if (a->compressedrow.use) {
2712           cusparsestruct->workVector = new THRUSTARRAY(m);
2713           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2714           matstruct->cprowIndices->assign(ridx,ridx+m);
2715           tmp = m;
2716         } else {
2717           cusparsestruct->workVector = NULL;
2718           matstruct->cprowIndices    = NULL;
2719           tmp = 0;
2720         }
2721         PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));
2722 
2723         /* assign the pointer */
2724         cusparsestruct->mat = matstruct;
2725       } catch(char *ex) {
2726         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2727       }
2728       PetscCallCUDA(WaitForCUDA());
2729       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
2730       cusparsestruct->nonzerostate = A->nonzerostate;
2731     }
2732     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2733   }
2734   PetscFunctionReturn(0);
2735 }
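/* A small worked example of the compressed-row branch above: for a 6-row matrix whose nonzeros lie only
   in rows 1 and 4, a->compressedrow stores m = 2 rows with rindex = {1,4}; the device CSR built here
   then has 2 rows, and cprowIndices records where their products scatter into the full-length work
   vector during MatMult. */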
2736 
2737 struct VecCUDAPlusEquals
2738 {
2739   template <typename Tuple>
2740   __host__ __device__
2741   void operator()(Tuple t)
2742   {
2743     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2744   }
2745 };
2746 
2747 struct VecCUDAEquals
2748 {
2749   template <typename Tuple>
2750   __host__ __device__
2751   void operator()(Tuple t)
2752   {
2753     thrust::get<1>(t) = thrust::get<0>(t);
2754   }
2755 };
2756 
2757 struct VecCUDAEqualsReverse
2758 {
2759   template <typename Tuple>
2760   __host__ __device__
2761   void operator()(Tuple t)
2762   {
2763     thrust::get<0>(t) = thrust::get<1>(t);
2764   }
2765 };
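/* These functors are intended for thrust::for_each over zipped ranges. An illustrative (file-external)
   use, accumulating a device work vector w into the entries of the raw array y selected by indices idx:

     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(w->begin(),
                        thrust::make_permutation_iterator(y,idx->begin()))),
                      thrust::make_zip_iterator(thrust::make_tuple(w->end(),
                        thrust::make_permutation_iterator(y,idx->end()))),
                      VecCUDAPlusEquals());
*/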
2766 
2767 struct MatMatCusparse {
2768   PetscBool             cisdense;
2769   PetscScalar           *Bt;
2770   Mat                   X;
2771   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2772   PetscLogDouble        flops;
2773   CsrMatrix             *Bcsr;
2774 
2775 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2776   cusparseSpMatDescr_t  matSpBDescr;
2777   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
2778   cusparseDnMatDescr_t  matBDescr;
2779   cusparseDnMatDescr_t  matCDescr;
2780   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
2781  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2782   void                  *dBuffer4;
2783   void                  *dBuffer5;
2784  #endif
2785   size_t                mmBufferSize;
2786   void                  *mmBuffer;
2787   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2788   cusparseSpGEMMDescr_t spgemmDesc;
2789 #endif
2790 };
2791 
2792 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2793 {
2794   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2795 
2796   PetscFunctionBegin;
2797   PetscCallCUDA(cudaFree(mmdata->Bt));
2798   delete mmdata->Bcsr;
2799  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2800   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2801   if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2802   if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2803   if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2804  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2805   if (mmdata->dBuffer4)  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2806   if (mmdata->dBuffer5)  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2807  #endif
2808   if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2809   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2810  #endif
2811   PetscCall(MatDestroy(&mmdata->X));
2812   PetscCall(PetscFree(data));
2813   PetscFunctionReturn(0);
2814 }
2815 
2816 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2817 
2818 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2819 {
2820   Mat_Product                  *product = C->product;
2821   Mat                          A,B;
2822   PetscInt                     m,n,blda,clda;
2823   PetscBool                    flg,biscuda;
2824   Mat_SeqAIJCUSPARSE           *cusp;
2825   cusparseStatus_t             stat;
2826   cusparseOperation_t          opA;
2827   const PetscScalar            *barray;
2828   PetscScalar                  *carray;
2829   MatMatCusparse               *mmdata;
2830   Mat_SeqAIJCUSPARSEMultStruct *mat;
2831   CsrMatrix                    *csrmat;
2832 
2833   PetscFunctionBegin;
2834   MatCheckProduct(C,1);
2835   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2836   mmdata = (MatMatCusparse*)product->data;
2837   A    = product->A;
2838   B    = product->B;
2839   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2840   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2841   /* Currently CopyToGpu does not copy if the matrix is bound to the CPU.
2842      Instead of silently accepting the wrong answer, we prefer to raise an error */
2843   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2844   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2845   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2846   switch (product->type) {
2847   case MATPRODUCT_AB:
2848   case MATPRODUCT_PtAP:
2849     mat = cusp->mat;
2850     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2851     m   = A->rmap->n;
2852     n   = B->cmap->n;
2853     break;
2854   case MATPRODUCT_AtB:
2855     if (!A->form_explicit_transpose) {
2856       mat = cusp->mat;
2857       opA = CUSPARSE_OPERATION_TRANSPOSE;
2858     } else {
2859       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2860       mat  = cusp->matTranspose;
2861       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2862     }
2863     m = A->cmap->n;
2864     n = B->cmap->n;
2865     break;
2866   case MATPRODUCT_ABt:
2867   case MATPRODUCT_RARt:
2868     mat = cusp->mat;
2869     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2870     m   = A->rmap->n;
2871     n   = B->rmap->n;
2872     break;
2873   default:
2874     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2875   }
2876   PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2877   csrmat = (CsrMatrix*)mat->mat;
2878   /* if the user passed a CPU matrix, copy the data to the GPU */
2879   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
2880   if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
2881   PetscCall(MatDenseCUDAGetArrayRead(B,&barray));
2882 
2883   PetscCall(MatDenseGetLDA(B,&blda));
2884   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2885     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
2886     PetscCall(MatDenseGetLDA(mmdata->X,&clda));
2887   } else {
2888     PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
2889     PetscCall(MatDenseGetLDA(C,&clda));
2890   }
2891 
2892   PetscCall(PetscLogGpuTimeBegin());
2893  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2894   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2895   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2896   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2897     size_t mmBufferSize;
2898     if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
2899     if (!mmdata->matBDescr) {
2900       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2901       mmdata->Blda = blda;
2902     }
2903 
2904     if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
2905     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2906       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2907       mmdata->Clda = clda;
2908     }
2909 
2910     if (!mat->matDescr) {
2911       stat = cusparseCreateCsr(&mat->matDescr,
2912                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2913                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2914                                csrmat->values->data().get(),
2915                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2916                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
2917     }
2918     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2919                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2920                                    mmdata->matCDescr,cusparse_scalartype,
2921                                    cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
2922     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2923       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2924       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
2925       mmdata->mmBufferSize = mmBufferSize;
2926     }
2927     mmdata->initialized = PETSC_TRUE;
2928   } else {
2929     /* to be safe, always update pointers of the mats */
2930     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
2931     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
2932     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
2933   }
2934 
2935   /* do cusparseSpMM, which supports transpose on B */
2936   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2937                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2938                       mmdata->matCDescr,cusparse_scalartype,
2939                       cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2940  #else
2941   PetscInt k;
2942   /* cusparseXcsrmm does not support transpose on B */
2943   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2944     cublasHandle_t cublasv2handle;
2945     cublasStatus_t cerr;
2946 
2947     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2948     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2949                        B->cmap->n,B->rmap->n,
2950                        &PETSC_CUSPARSE_ONE ,barray,blda,
2951                        &PETSC_CUSPARSE_ZERO,barray,blda,
2952                        mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
2953     blda = B->cmap->n;
2954     k    = B->cmap->n;
2955   } else {
2956     k    = B->rmap->n;
2957   }
2958 
2959   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2960   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2961                            csrmat->num_entries,mat->alpha_one,mat->descr,
2962                            csrmat->values->data().get(),
2963                            csrmat->row_offsets->data().get(),
2964                            csrmat->column_indices->data().get(),
2965                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2966                            carray,clda);PetscCallCUSPARSE(stat);
2967  #endif
2968   PetscCall(PetscLogGpuTimeEnd());
2969   PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
2970   PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
2971   if (product->type == MATPRODUCT_RARt) {
2972     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
2973     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
2974   } else if (product->type == MATPRODUCT_PtAP) {
2975     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
2976     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
2977   } else {
2978     PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
2979   }
2980   if (mmdata->cisdense) {
2981     PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
2982   }
2983   if (!biscuda) {
2984     PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
2985   }
2986   PetscFunctionReturn(0);
2987 }
2988 
2989 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2990 {
2991   Mat_Product        *product = C->product;
2992   Mat                A,B;
2993   PetscInt           m,n;
2994   PetscBool          cisdense,flg;
2995   MatMatCusparse     *mmdata;
2996   Mat_SeqAIJCUSPARSE *cusp;
2997 
2998   PetscFunctionBegin;
2999   MatCheckProduct(C,1);
3000   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
3001   A    = product->A;
3002   B    = product->B;
3003   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
3004   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
3005   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3006   PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3007   switch (product->type) {
3008   case MATPRODUCT_AB:
3009     m = A->rmap->n;
3010     n = B->cmap->n;
3011     break;
3012   case MATPRODUCT_AtB:
3013     m = A->cmap->n;
3014     n = B->cmap->n;
3015     break;
3016   case MATPRODUCT_ABt:
3017     m = A->rmap->n;
3018     n = B->rmap->n;
3019     break;
3020   case MATPRODUCT_PtAP:
3021     m = B->cmap->n;
3022     n = B->cmap->n;
3023     break;
3024   case MATPRODUCT_RARt:
3025     m = B->rmap->n;
3026     n = B->rmap->n;
3027     break;
3028   default:
3029     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3030   }
3031   PetscCall(MatSetSizes(C,m,n,m,n));
3032   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
3033   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
3034   PetscCall(MatSetType(C,MATSEQDENSECUDA));
3035 
3036   /* product data */
3037   PetscCall(PetscNew(&mmdata));
3038   mmdata->cisdense = cisdense;
3039  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
3040   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
3041   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
3042     PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
3043   }
3044  #endif
3045   /* for these products we need intermediate storage */
3046   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
3047     PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
3048     PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
3049     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
3050       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
3051     } else {
3052       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
3053     }
3054   }
3055   C->product->data    = mmdata;
3056   C->product->destroy = MatDestroy_MatMatCusparse;
3057 
3058   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
3059   PetscFunctionReturn(0);
3060 }
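/* Illustrative driver for the symbolic/numeric pair above through the public MatProduct API:

     Mat C;
     PetscCall(MatProductCreate(A,B,NULL,&C));      // A is seqaijcusparse, B is seqdensecuda
     PetscCall(MatProductSetType(C,MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));              // dispatches here, sets up MatMatCusparse data
     PetscCall(MatProductNumeric(C));               // runs cusparseSpMM on the GPU
*/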
3061 
3062 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3063 {
3064   Mat_Product                  *product = C->product;
3065   Mat                          A,B;
3066   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
3067   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
3068   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
3069   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3070   PetscBool                    flg;
3071   cusparseStatus_t             stat;
3072   MatProductType               ptype;
3073   MatMatCusparse               *mmdata;
3074 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3075   cusparseSpMatDescr_t         BmatSpDescr;
3076 #endif
3077   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3078 
3079   PetscFunctionBegin;
3080   MatCheckProduct(C,1);
3081   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
3082   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
3083   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
3084   mmdata = (MatMatCusparse*)C->product->data;
3085   A = product->A;
3086   B = product->B;
3087   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
3088     mmdata->reusesym = PETSC_FALSE;
3089     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
3090     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3091     Cmat = Ccusp->mat;
3092     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
3093     Ccsr = (CsrMatrix*)Cmat->mat;
3094     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
3095     goto finalize;
3096   }
3097   if (!c->nz) goto finalize;
3098   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
3099   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
3100   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
3101   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
3102   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
3103   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
3104   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3105   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
3106   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
3107   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3108   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3109   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3110   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3111   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3112 
3113   ptype = product->type;
3114   if (A->symmetric && ptype == MATPRODUCT_AtB) {
3115     ptype = MATPRODUCT_AB;
3116     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
3117   }
3118   if (B->symmetric && ptype == MATPRODUCT_ABt) {
3119     ptype = MATPRODUCT_AB;
3120     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
3121   }
3122   switch (ptype) {
3123   case MATPRODUCT_AB:
3124     Amat = Acusp->mat;
3125     Bmat = Bcusp->mat;
3126     break;
3127   case MATPRODUCT_AtB:
3128     Amat = Acusp->matTranspose;
3129     Bmat = Bcusp->mat;
3130     break;
3131   case MATPRODUCT_ABt:
3132     Amat = Acusp->mat;
3133     Bmat = Bcusp->matTranspose;
3134     break;
3135   default:
3136     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3137   }
3138   Cmat = Ccusp->mat;
3139   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
3140   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
3141   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
3142   Acsr = (CsrMatrix*)Amat->mat;
3143   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
3144   Ccsr = (CsrMatrix*)Cmat->mat;
3145   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
3146   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
3147   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
3148   PetscCall(PetscLogGpuTimeBegin());
3149 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3150   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
3151   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3152   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3153     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
3154                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3155                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3156                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3157   #else
3158     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3159                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3160                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3161                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
3162     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
3163                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3164                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3165   #endif
3166 #else
3167   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
3168                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3169                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3170                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3171                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
3172 #endif
3173   PetscCall(PetscLogGpuFlops(mmdata->flops));
3174   PetscCallCUDA(WaitForCUDA());
3175   PetscCall(PetscLogGpuTimeEnd());
3176   C->offloadmask = PETSC_OFFLOAD_GPU;
3177 finalize:
3178   /* shorter version of MatAssemblyEnd_SeqAIJ */
3179   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
3180   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
3181   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
3182   c->reallocs         = 0;
3183   C->info.mallocs    += 0;
3184   C->info.nz_unneeded = 0;
3185   C->assembled = C->was_assembled = PETSC_TRUE;
3186   C->num_ass++;
3187   PetscFunctionReturn(0);
3188 }
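/* The cuSPARSE SpGEMM call sequences relied upon above: with CUDA >= 11.4 the "reuse" variant lets
   repeated numeric products call only cusparseSpGEMMreuse_compute() once the pattern is set up; with
   11.0 <= CUDA < 11.4, cusparseSpGEMM_compute() must be followed by cusparseSpGEMM_copy() to move the
   result into C's arrays; before CUDA 11.0 a single cusparse_csr_spgemm() call computes C directly. */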
3189 
3190 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3191 {
3192   Mat_Product                  *product = C->product;
3193   Mat                          A,B;
3194   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
3195   Mat_SeqAIJ                   *a,*b,*c;
3196   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
3197   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3198   PetscInt                     i,j,m,n,k;
3199   PetscBool                    flg;
3200   cusparseStatus_t             stat;
3201   MatProductType               ptype;
3202   MatMatCusparse               *mmdata;
3203   PetscLogDouble               flops;
3204   PetscBool                    biscompressed,ciscompressed;
3205 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3206   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
3207   cusparseSpMatDescr_t         BmatSpDescr;
3208 #else
3209   int                          cnz;
3210 #endif
3211   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3212 
3213   PetscFunctionBegin;
3214   MatCheckProduct(C,1);
3215   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
3216   A    = product->A;
3217   B    = product->B;
3218   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
3219   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
3220   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
3221   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
3222   a = (Mat_SeqAIJ*)A->data;
3223   b = (Mat_SeqAIJ*)B->data;
3224   /* product data */
3225   PetscCall(PetscNew(&mmdata));
3226   C->product->data    = mmdata;
3227   C->product->destroy = MatDestroy_MatMatCusparse;
3228 
3229   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3230   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3231   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3232   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
3233   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3234   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3235 
3236   ptype = product->type;
3237   if (A->symmetric && ptype == MATPRODUCT_AtB) {
3238     ptype = MATPRODUCT_AB;
3239     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3240   }
3241   if (B->symmetric && ptype == MATPRODUCT_ABt) {
3242     ptype = MATPRODUCT_AB;
3243     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3244   }
3245   biscompressed = PETSC_FALSE;
3246   ciscompressed = PETSC_FALSE;
3247   switch (ptype) {
3248   case MATPRODUCT_AB:
3249     m = A->rmap->n;
3250     n = B->cmap->n;
3251     k = A->cmap->n;
3252     Amat = Acusp->mat;
3253     Bmat = Bcusp->mat;
3254     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3255     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3256     break;
3257   case MATPRODUCT_AtB:
3258     m = A->cmap->n;
3259     n = B->cmap->n;
3260     k = A->rmap->n;
3261     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3262     Amat = Acusp->matTranspose;
3263     Bmat = Bcusp->mat;
3264     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3265     break;
3266   case MATPRODUCT_ABt:
3267     m = A->rmap->n;
3268     n = B->rmap->n;
3269     k = A->cmap->n;
3270     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3271     Amat = Acusp->mat;
3272     Bmat = Bcusp->matTranspose;
3273     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3274     break;
3275   default:
3276     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3277   }
3278 
3279   /* create cusparse matrix */
3280   PetscCall(MatSetSizes(C,m,n,m,n));
3281   PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
3282   c     = (Mat_SeqAIJ*)C->data;
3283   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
3284   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3285   Ccsr  = new CsrMatrix;
3286 
3287   c->compressedrow.use = ciscompressed;
3288   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
3289     c->compressedrow.nrows = a->compressedrow.nrows;
3290     PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
3291     PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
3292     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3293     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3294     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
3295   } else {
3296     c->compressedrow.nrows  = 0;
3297     c->compressedrow.i      = NULL;
3298     c->compressedrow.rindex = NULL;
3299     Ccusp->workVector       = NULL;
3300     Cmat->cprowIndices      = NULL;
3301   }
3302   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
3303   Ccusp->mat      = Cmat;
3304   Ccusp->mat->mat = Ccsr;
3305   Ccsr->num_rows    = Ccusp->nrows;
3306   Ccsr->num_cols    = n;
3307   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
3308   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3309   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3310   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3311   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
3312   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
3313   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3314   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
3315   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
3316   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
3317   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
3318     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
3319     c->nz = 0;
3320     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3321     Ccsr->values = new THRUSTARRAY(c->nz);
3322     goto finalizesym;
3323   }
3324 
3325   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
3326   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
3327   Acsr = (CsrMatrix*)Amat->mat;
3328   if (!biscompressed) {
3329     Bcsr = (CsrMatrix*)Bmat->mat;
3330 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3331     BmatSpDescr = Bmat->matDescr;
3332 #endif
3333   } else { /* we need to use row offsets for the full matrix */
3334     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
3335     Bcsr = new CsrMatrix;
3336     Bcsr->num_rows       = B->rmap->n;
3337     Bcsr->num_cols       = cBcsr->num_cols;
3338     Bcsr->num_entries    = cBcsr->num_entries;
3339     Bcsr->column_indices = cBcsr->column_indices;
3340     Bcsr->values         = cBcsr->values;
3341     if (!Bcusp->rowoffsets_gpu) {
3342       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3343       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3344       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
3345     }
3346     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3347     mmdata->Bcsr = Bcsr;
3348 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3349     if (Bcsr->num_rows && Bcsr->num_cols) {
3350       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
3351                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3352                                Bcsr->values->data().get(),
3353                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
3354                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
3355     }
3356     BmatSpDescr = mmdata->matSpBDescr;
3357 #endif
3358   }
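     /* Note: when B is compressed, the temporary Bcsr built above aliases the column_indices and
        values arrays of B's compressed CsrMatrix; only row_offsets differs, pointing at the cached
        full-length offsets in Bcusp->rowoffsets_gpu. */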
3359   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
3360   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
3361   /* precompute flops count */
3362   if (ptype == MATPRODUCT_AB) {
3363     for (i=0, flops = 0; i<A->rmap->n; i++) {
3364       const PetscInt st = a->i[i];
3365       const PetscInt en = a->i[i+1];
3366       for (j=st; j<en; j++) {
3367         const PetscInt brow = a->j[j];
3368         flops += 2.*(b->i[brow+1] - b->i[brow]);
3369       }
3370     }
3371   } else if (ptype == MATPRODUCT_AtB) {
3372     for (i=0, flops = 0; i<A->rmap->n; i++) {
3373       const PetscInt anzi = a->i[i+1] - a->i[i];
3374       const PetscInt bnzi = b->i[i+1] - b->i[i];
3375       flops += (2.*anzi)*bnzi;
3376     }
3377   } else { /* TODO */
3378     flops = 0.;
3379   }
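     /* Example for C = A*B: each nonzero A(i,k) multiplies row k of B, contributing one multiply
        and one add per entry of that row, i.e. 2*nnz(B(k,:)) flops, which is what the loop above
        accumulates. */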
3380 
3381   mmdata->flops = flops;
3382   PetscCall(PetscLogGpuTimeBegin());
3383 
3384 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3385   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3386   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
3387                           NULL, NULL, NULL,
3388                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
3389                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
3390   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3391  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3392  {
3393   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3394      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3395   */
3396   void*  dBuffer1 = NULL;
3397   void*  dBuffer2 = NULL;
3398   void*  dBuffer3 = NULL;
3399   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3400   size_t bufferSize1 = 0;
3401   size_t bufferSize2 = 0;
3402   size_t bufferSize3 = 0;
3403   size_t bufferSize4 = 0;
3404   size_t bufferSize5 = 0;
3405 
3406   /*----------------------------------------------------------------------*/
3407   /* query how many bytes of external memory are needed (bufferSize1) */
3408   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3409                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3410                                             &bufferSize1, NULL);PetscCallCUSPARSE(stat);
3411   PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
3412   /* inspect the matrices A and B to understand the memory requirement for the next step */
3413   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3414                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3415                                             &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);
3416 
3417   /*----------------------------------------------------------------------*/
3418   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3419                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3420                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
3421   PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
3422   PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
3423   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
3424   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3425                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3426                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
3427   PetscCallCUDA(cudaFree(dBuffer1));
3428   PetscCallCUDA(cudaFree(dBuffer2));
3429 
3430   /*----------------------------------------------------------------------*/
3431   /* get matrix C non-zero entries C_nnz1 */
3432   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3433   c->nz = (PetscInt) C_nnz1;
3434   /* allocate matrix C */
3435   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3436   Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3437   /* update matC with the new pointers */
3438   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
3439                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3440 
3441   /*----------------------------------------------------------------------*/
3442   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3443                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3444                                   &bufferSize5, NULL);PetscCallCUSPARSE(stat);
3445   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
3446   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3447                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3448                                   &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
3449   PetscCallCUDA(cudaFree(dBuffer3));
3450   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
3451                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3452                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3453                                      mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3454   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
3455  }
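     /* Summary of the cusparseSpGEMMreuse protocol used above (following the NVIDIA sample):
          1) workEstimation: query size, then run with dBuffer1
          2) nnz:            query sizes, then run with dBuffer2/dBuffer3/dBuffer4
          3) allocate C and register its arrays with cusparseCsrSetPointers()
          4) copy:           query size, then run with dBuffer5
          5) compute:        fill the numeric values (repeated by the numeric phase above)
        dBuffer4/dBuffer5 must outlive this routine because compute reuses them, hence they are
        stashed in mmdata, while dBuffer1/2/3 are freed here. */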
3456  #else
3457   size_t bufSize2;
3458   /* query how many bytes of external memory are needed */
3459   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3460                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3461                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3462                                        mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
3463   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
3464   /* inspect the matrices A and B to understand the memory requirement for the next step */
3465   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3466                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3467                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3468                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
3469   /* query the buffer size needed by the compute step */
3470   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3471                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3472                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3473                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
3474   /* Neither the CUSPARSE documentation nor the API is clear here:
3475      we need both buffers to perform the operations properly!
3476      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3477      it only appears in the workEstimation calls, yet it seems to be needed in compute,
3478      so presumably its address is stored in the descriptor. What a messy API... */
3479   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
3480   /* compute the intermediate product of A * B */
3481   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3482                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3483                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3484                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
3485   /* get matrix C non-zero entries C_nnz1 */
3486   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3487   c->nz = (PetscInt) C_nnz1;
3488   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
3489   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3490   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3491   Ccsr->values = new THRUSTARRAY(c->nz);
3492   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3493   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
3494                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3495   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
3496                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3497                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3498  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3499 #else
3500   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3501   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
3502                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3503                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3504                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3505                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
3506   c->nz = cnz;
3507   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3508   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3509   Ccsr->values = new THRUSTARRAY(c->nz);
3510   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3511 
3512   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3513   /* With the old gemm interface (removed as of CUDA 11.0) we cannot compute the symbolic factorization only.
3514      I have tried the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values, but it seems quite buggy when
3515      D is NULL, despite the fact that the CUSPARSE documentation claims this is supported! */
3516   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
3517                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3518                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3519                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3520                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
3521 #endif
3522   PetscCall(PetscLogGpuFlops(mmdata->flops));
3523   PetscCall(PetscLogGpuTimeEnd());
3524 finalizesym:
3525   c->singlemalloc = PETSC_FALSE;
3526   c->free_a       = PETSC_TRUE;
3527   c->free_ij      = PETSC_TRUE;
3528   PetscCall(PetscMalloc1(m+1,&c->i));
3529   PetscCall(PetscMalloc1(c->nz,&c->j));
3530   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3531     PetscInt *d_i = c->i;
3532     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3533     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3534     ii   = *Ccsr->row_offsets;
3535     jj   = *Ccsr->column_indices;
3536     if (ciscompressed) d_i = c->compressedrow.i;
3537     PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3538     PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3539   } else {
3540     PetscInt *d_i = c->i;
3541     if (ciscompressed) d_i = c->compressedrow.i;
3542     PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3543     PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3544   }
3545   if (ciscompressed) { /* need to expand host row offsets */
3546     PetscInt r = 0;
3547     c->i[0] = 0;
3548     for (k = 0; k < c->compressedrow.nrows; k++) {
3549       const PetscInt next = c->compressedrow.rindex[k];
3550       const PetscInt old = c->compressedrow.i[k];
3551       for (; r < next; r++) c->i[r+1] = old;
3552     }
3553     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
3554   }
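     /* Worked example of the expansion above: m = 5, nonzero rows rindex = {1,3} and compressed
        offsets {0,2,5} yield the full offsets c->i = {0,0,2,2,5,5}, i.e. rows 0, 2 and 4 are
        empty, row 1 has 2 entries and row 3 has 3. */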
3555   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
3556   PetscCall(PetscMalloc1(m,&c->ilen));
3557   PetscCall(PetscMalloc1(m,&c->imax));
3558   c->maxnz = c->nz;
3559   c->nonzerorowcnt = 0;
3560   c->rmax = 0;
3561   for (k = 0; k < m; k++) {
3562     const PetscInt nn = c->i[k+1] - c->i[k];
3563     c->ilen[k] = c->imax[k] = nn;
3564     c->nonzerorowcnt += (PetscInt)!!nn;
3565     c->rmax = PetscMax(c->rmax,nn);
3566   }
3567   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3568   PetscCall(PetscMalloc1(c->nz,&c->a));
3569   Ccsr->num_entries = c->nz;
3570 
3571   C->nonzerostate++;
3572   PetscCall(PetscLayoutSetUp(C->rmap));
3573   PetscCall(PetscLayoutSetUp(C->cmap));
3574   Ccusp->nonzerostate = C->nonzerostate;
3575   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
3576   C->preallocated  = PETSC_TRUE;
3577   C->assembled     = PETSC_FALSE;
3578   C->was_assembled = PETSC_FALSE;
3579   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3580     mmdata->reusesym = PETSC_TRUE;
3581     C->offloadmask   = PETSC_OFFLOAD_GPU;
3582   }
3583   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3584   PetscFunctionReturn(0);
3585 }
3586 
3587 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3588 
3589 /* handles sparse or dense B */
3590 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3591 {
3592   Mat_Product    *product = mat->product;
3593   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
3594 
3595   PetscFunctionBegin;
3596   MatCheckProduct(mat,1);
3597   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
3598   if (!product->A->boundtocpu && !product->B->boundtocpu) {
3599     PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
3600   }
3601   if (product->type == MATPRODUCT_ABC) {
3602     Ciscusp = PETSC_FALSE;
3603     if (!product->C->boundtocpu) {
3604       PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
3605     }
3606   }
3607   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
3608     PetscBool usecpu = PETSC_FALSE;
3609     switch (product->type) {
3610     case MATPRODUCT_AB:
3611       if (product->api_user) {
3612         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
3613         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
3614         PetscOptionsEnd();
3615       } else {
3616         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
3617         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
3618         PetscOptionsEnd();
3619       }
3620       break;
3621     case MATPRODUCT_AtB:
3622       if (product->api_user) {
3623         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
3624         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
3625         PetscOptionsEnd();
3626       } else {
3627         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
3628         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
3629         PetscOptionsEnd();
3630       }
3631       break;
3632     case MATPRODUCT_PtAP:
3633       if (product->api_user) {
3634         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
3635         PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
3636         PetscOptionsEnd();
3637       } else {
3638         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
3639         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
3640         PetscOptionsEnd();
3641       }
3642       break;
3643     case MATPRODUCT_RARt:
3644       if (product->api_user) {
3645         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
3646         PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
3647         PetscOptionsEnd();
3648       } else {
3649         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
3650         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
3651         PetscOptionsEnd();
3652       }
3653       break;
3654     case MATPRODUCT_ABC:
3655       if (product->api_user) {
3656         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
3657         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
3658         PetscOptionsEnd();
3659       } else {
3660         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
3661         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
3662         PetscOptionsEnd();
3663       }
3664       break;
3665     default:
3666       break;
3667     }
3668     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3669   }
3670   /* dispatch */
3671   if (isdense) {
3672     switch (product->type) {
3673     case MATPRODUCT_AB:
3674     case MATPRODUCT_AtB:
3675     case MATPRODUCT_ABt:
3676     case MATPRODUCT_PtAP:
3677     case MATPRODUCT_RARt:
3678       if (product->A->boundtocpu) {
3679         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3680       } else {
3681         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3682       }
3683       break;
3684     case MATPRODUCT_ABC:
3685       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3686       break;
3687     default:
3688       break;
3689     }
3690   } else if (Biscusp && Ciscusp) {
3691     switch (product->type) {
3692     case MATPRODUCT_AB:
3693     case MATPRODUCT_AtB:
3694     case MATPRODUCT_ABt:
3695       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3696       break;
3697     case MATPRODUCT_PtAP:
3698     case MATPRODUCT_RARt:
3699     case MATPRODUCT_ABC:
3700       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3701       break;
3702     default:
3703       break;
3704     }
3705   } else { /* fallback for AIJ */
3706     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3707   }
3708   PetscFunctionReturn(0);
3709 }
3710 
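     /* For example, an API user (MatMatMult() etc.) can force the CPU fallback for a given product
        from the command line (a sketch; the generic MatProduct path uses
        -mat_product_algorithm_backend_cpu instead):

          ./app -matmatmult_backend_cpu    # C = A*B computed by the SeqAIJ CPU kernels
     */
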
3711 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
3712 {
3713   PetscFunctionBegin;
3714   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
3715   PetscFunctionReturn(0);
3716 }
3717 
3718 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
3719 {
3720   PetscFunctionBegin;
3721   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
3722   PetscFunctionReturn(0);
3723 }
3724 
3725 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
3726 {
3727   PetscFunctionBegin;
3728   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
3729   PetscFunctionReturn(0);
3730 }
3731 
3732 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3733 {
3734   PetscFunctionBegin;
3735   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
3736   PetscFunctionReturn(0);
3737 }
3738 
3739 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
3740 {
3741   PetscFunctionBegin;
3742   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
3743   PetscFunctionReturn(0);
3744 }
3745 
3746 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
3747 {
3748   int i = blockIdx.x*blockDim.x + threadIdx.x;
3749   if (i < n) y[idx[i]] += x[i];
3750 }
3751 
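     /* Dispatch summary for the wrappers around MatMultAddKernel_SeqAIJCUSPARSE:
          MatMult                          -> trans=FALSE, herm=FALSE, yy=NULL
          MatMultAdd                       -> trans=FALSE, herm=FALSE
          MatMultTranspose                 -> trans=TRUE,  herm=FALSE, yy=NULL
          MatMultTransposeAdd (below)      -> trans=TRUE,  herm=FALSE
          MatMultHermitianTranspose(+Add)  -> trans=TRUE,  herm=TRUE */
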
3752 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3753 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
3754 {
3755   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
3756   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
3757   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3758   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
3759   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3760   PetscBool                    compressed;
3761 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3762   PetscInt                     nx,ny;
3763 #endif
3764 
3765   PetscFunctionBegin;
3766   PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian without transpose is not supported");
3767   if (!a->nz) {
3768     if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
3769     else PetscCall(VecCopy_SeqCUDA(yy,zz));
3770     PetscFunctionReturn(0);
3771   }
3772   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3773   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3774   if (!trans) {
3775     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
3776     PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3777   } else {
3778     if (herm || !A->form_explicit_transpose) {
3779       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3780       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
3781     } else {
3782       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3783       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
3784     }
3785   }
3786   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3787   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3788 
3789   try {
3790     PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
3791     if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
3792     else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */
3793 
3794     PetscCall(PetscLogGpuTimeBegin());
3795     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3796       /* z = A x + beta y.
3797          If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3798          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3799       */
3800       xptr = xarray;
3801       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3802       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3803      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3804       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3805           allocated to accommodate different uses. So we get the length info directly from mat.
3806        */
3807       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3808         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3809         nx = mat->num_cols;
3810         ny = mat->num_rows;
3811       }
3812      #endif
3813     } else {
3814       /* z = A^T x + beta y
3815          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3816          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3817        */
3818       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3819       dptr = zarray;
3820       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3821       if (compressed) { /* Scatter x to work vector */
3822         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3823         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3824                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3825                          VecCUDAEqualsReverse());
3826       }
3827      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3828       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3829         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3830         nx = mat->num_rows;
3831         ny = mat->num_cols;
3832       }
3833      #endif
3834     }
3835 
3836     /* csr_spmv does y = alpha op(A) x + beta y */
3837     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3838      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3839       PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3840       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3841         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
3842         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
3843         PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3844                                                matstruct->matDescr,
3845                                                matstruct->cuSpMV[opA].vecXDescr, beta,
3846                                                matstruct->cuSpMV[opA].vecYDescr,
3847                                                cusparse_scalartype,
3848                                                cusparsestruct->spmvAlg,
3849                                                &matstruct->cuSpMV[opA].spmvBufferSize));
3850         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));
3851 
3852         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3853       } else {
3854         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3855         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
3856         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
3857       }
3858 
3859       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
3860                                   matstruct->alpha_one,
3861                                   matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3862                                   matstruct->cuSpMV[opA].vecXDescr,
3863                                   beta,
3864                                   matstruct->cuSpMV[opA].vecYDescr,
3865                                   cusparse_scalartype,
3866                                   cusparsestruct->spmvAlg,
3867                                   matstruct->cuSpMV[opA].spmvBuffer));
3868      #else
3869       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3870       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
3871                                        mat->num_rows, mat->num_cols,
3872                                        mat->num_entries, matstruct->alpha_one, matstruct->descr,
3873                                        mat->values->data().get(), mat->row_offsets->data().get(),
3874                                        mat->column_indices->data().get(), xptr, beta,
3875                                        dptr));
3876      #endif
3877     } else {
3878       if (cusparsestruct->nrows) {
3879        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3880         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3881        #else
3882         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3883         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
3884                                          matstruct->alpha_one, matstruct->descr, hybMat,
3885                                          xptr, beta,
3886                                          dptr));
3887        #endif
3888       }
3889     }
3890     PetscCall(PetscLogGpuTimeEnd());
3891 
3892     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3893       if (yy) { /* MatMultAdd: zz = A*xx + yy */
3894         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3895           PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
3896         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3897           PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
3898         }
3899       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3900         PetscCall(VecSet_SeqCUDA(zz,0));
3901       }
3902 
3903       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3904       if (compressed) {
3905         PetscCall(PetscLogGpuTimeBegin());
3906         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3907            and in the destructor of the scope it calls cudaStreamSynchronize() on this stream. One would have to store all events to
3908            prevent that, so I just added a custom ScatterAdd kernel instead.
3909          */
3910        #if 0
3911         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3912         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3913                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3914                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3915                          VecCUDAPlusEquals());
3916        #else
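             /* launch one thread per compressed row entry: ceil(n/256) blocks of 256 threads on PETSc's default stream */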
3917         PetscInt n = matstruct->cprowIndices->size();
3918         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3919        #endif
3920         PetscCall(PetscLogGpuTimeEnd());
3921       }
3922     } else {
3923       if (yy && yy != zz) {
3924         PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
3925       }
3926     }
3927     PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
3928     if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
3929     else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
3930   } catch(char *ex) {
3931     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3932   }
3933   if (yy) {
3934     PetscCall(PetscLogGpuFlops(2.0*a->nz));
3935   } else {
3936     PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
3937   }
3938   PetscFunctionReturn(0);
3939 }
3940 
3941 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3942 {
3943   PetscFunctionBegin;
3944   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
3945   PetscFunctionReturn(0);
3946 }
3947 
3948 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3949 {
3950   PetscObjectState   onnz = A->nonzerostate;
3951   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3952 
3953   PetscFunctionBegin;
3954   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
3955   if (onnz != A->nonzerostate && cusp->deviceMat) {
3957     PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
3958     PetscCallCUDA(cudaFree(cusp->deviceMat));
3959     cusp->deviceMat = NULL;
3960   }
3961   PetscFunctionReturn(0);
3962 }
3963 
3964 /* --------------------------------------------------------------------------------*/
3965 /*@
3966    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3967    (the default sequential PETSc format). This matrix will ultimately be pushed down
3968    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3969    assembly performance the user should preallocate the matrix storage by setting
3970    the parameter nz (or the array nnz).  By setting these parameters accurately,
3971    performance during matrix assembly can be increased by more than a factor of 50.
3972 
3973    Collective
3974 
3975    Input Parameters:
3976 +  comm - MPI communicator, set to PETSC_COMM_SELF
3977 .  m - number of rows
3978 .  n - number of columns
3979 .  nz - number of nonzeros per row (same for all rows)
3980 -  nnz - array containing the number of nonzeros in the various rows
3981          (possibly different for each row) or NULL
3982 
3983    Output Parameter:
3984 .  A - the matrix
3985 
3986    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3987    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3988    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3989 
3990    Notes:
3991    If nnz is given then nz is ignored
3992 
3993    The AIJ format (also called the Yale sparse matrix format or
3994    compressed row storage) is fully compatible with standard Fortran 77
3995    storage.  That is, the stored row and column indices can begin at
3996    either one (as in Fortran) or zero.  See the users' manual for details.
3997 
3998    Specify the preallocated storage with either nz or nnz (not both).
3999    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
4000    allocation.  For large problems you MUST preallocate memory or you
4001    will get TERRIBLE performance, see the users' manual chapter on matrices.
4002 
4003    By default, this format uses inodes (identical nodes) when possible, to
4004    improve numerical efficiency of matrix-vector products and solves. We
4005    search for consecutive rows with the same nonzero structure, thereby
4006    reusing matrix information to achieve increased efficiency.
4007 
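        Example usage (a minimal sketch; error checking and value insertion elided):
     .vb
        Mat A;
        MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,n,5,NULL,&A);
        ... insert entries with MatSetValues(), then ...
        MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
        MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
        MatDestroy(&A);
     .ve
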
4008    Level: intermediate
4009 
4010 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
4011 @*/
4012 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
4013 {
4014   PetscFunctionBegin;
4015   PetscCall(MatCreate(comm,A));
4016   PetscCall(MatSetSizes(*A,m,n,m,n));
4017   PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
4018   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
4019   PetscFunctionReturn(0);
4020 }
4021 
4022 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
4023 {
4024   PetscFunctionBegin;
4025   if (A->factortype == MAT_FACTOR_NONE) {
4026     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
4027   } else {
4028     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
4029   }
4030   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
4031   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
4032   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
4033   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
4034   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
4035   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
4036   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
4037   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
4038   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
4039   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
4040   PetscCall(MatDestroy_SeqAIJ(A));
4041   PetscFunctionReturn(0);
4042 }
4043 
4044 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
4045 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
4046 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
4047 {
4048   PetscFunctionBegin;
4049   PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
4050   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
4051   PetscFunctionReturn(0);
4052 }
4053 
4054 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
4055 {
4056   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
4057   Mat_SeqAIJCUSPARSE *cy;
4058   Mat_SeqAIJCUSPARSE *cx;
4059   PetscScalar        *ay;
4060   const PetscScalar  *ax;
4061   CsrMatrix          *csry,*csrx;
4062 
4063   PetscFunctionBegin;
4064   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
4065   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
4066   if (X->ops->axpy != Y->ops->axpy) {
4067     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
4068     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
4069     PetscFunctionReturn(0);
4070   }
4071   /* if we are here, it means both matrices are bound to GPU */
4072   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
4073   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
4074   PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
4075   PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
4076   csry = (CsrMatrix*)cy->mat->mat;
4077   csrx = (CsrMatrix*)cx->mat->mat;
4078   /* see if we can turn this into a cublas axpy */
4079   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
4080     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
4081     if (eq) {
4082       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
4083     }
4084     if (eq) str = SAME_NONZERO_PATTERN;
4085   }
4086   /* spgeam is buggy with one column */
4087   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
4088 
4089   if (str == SUBSET_NONZERO_PATTERN) {
4090     PetscScalar b = 1.0;
4091 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4092     size_t      bufferSize;
4093     void        *buffer;
4094 #endif
4095 
4096     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
4097     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
4098     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
4099 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4100     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
4101                                                   &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
4102                                                   &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
4103                                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
4104     PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
4105     PetscCall(PetscLogGpuTimeBegin());
4106     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
4107                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
4108                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
4109                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
4110     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
4111     PetscCall(PetscLogGpuTimeEnd());
4112     PetscCallCUDA(cudaFree(buffer));
4113 #else
4114     PetscCall(PetscLogGpuTimeBegin());
4115     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
4116                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
4117                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
4118                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
4119     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
4120     PetscCall(PetscLogGpuTimeEnd());
4121 #endif
4122     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
4123     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
4124     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
4125     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
4126   } else if (str == SAME_NONZERO_PATTERN) {
4127     cublasHandle_t cublasv2handle;
4128     PetscBLASInt   one = 1, bnz = 1;
4129 
4130     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
4131     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
4132     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
4133     PetscCall(PetscBLASIntCast(x->nz,&bnz));
4134     PetscCall(PetscLogGpuTimeBegin());
4135     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
4136     PetscCall(PetscLogGpuFlops(2.0*bnz));
4137     PetscCall(PetscLogGpuTimeEnd());
4138     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
4139     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
4140     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
4141   } else {
4142     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
4143     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
4144   }
4145   PetscFunctionReturn(0);
4146 }
4147 
4148 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
4149 {
4150   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
4151   PetscScalar    *ay;
4152   cublasHandle_t cublasv2handle;
4153   PetscBLASInt   one = 1, bnz = 1;
4154 
4155   PetscFunctionBegin;
4156   PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
4157   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
4158   PetscCall(PetscBLASIntCast(y->nz,&bnz));
4159   PetscCall(PetscLogGpuTimeBegin());
4160   PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
4161   PetscCall(PetscLogGpuFlops(bnz));
4162   PetscCall(PetscLogGpuTimeEnd());
4163   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
4164   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
4165   PetscFunctionReturn(0);
4166 }
4167 
4168 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
4169 {
4170   PetscBool      both = PETSC_FALSE;
4171   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4172 
4173   PetscFunctionBegin;
4174   if (A->factortype == MAT_FACTOR_NONE) {
4175     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
4176     if (spptr->mat) {
4177       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
4178       if (matrix->values) {
4179         both = PETSC_TRUE;
4180         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
4181       }
4182     }
4183     if (spptr->matTranspose) {
4184       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
4185       if (matrix->values) {
4186         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
4187       }
4188     }
4189   }
4190   PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
4191   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4192   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
4193   else A->offloadmask = PETSC_OFFLOAD_CPU;
4194   PetscFunctionReturn(0);
4195 }
4196 
4197 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
4198 {
4199   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4200 
4201   PetscFunctionBegin;
4202   if (A->factortype != MAT_FACTOR_NONE) {
4203     A->boundtocpu = flg;
4204     PetscFunctionReturn(0);
4205   }
4206   if (flg) {
4207     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
4208 
4209     A->ops->scale                     = MatScale_SeqAIJ;
4210     A->ops->axpy                      = MatAXPY_SeqAIJ;
4211     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
4212     A->ops->mult                      = MatMult_SeqAIJ;
4213     A->ops->multadd                   = MatMultAdd_SeqAIJ;
4214     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
4215     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
4216     A->ops->multhermitiantranspose    = NULL;
4217     A->ops->multhermitiantransposeadd = NULL;
4218     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
4219     PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
4220     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
4221     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
4222     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
4223     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
4224     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
4225     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
4226   } else {
4227     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
4228     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
4229     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
4230     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4231     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4232     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4233     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4234     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4235     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4236     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
4237     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4238     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4239     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4240     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4241     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4242     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4243     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
4244 
4245     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4246     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
4247     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
4248     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
4249     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
4250     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
4251   }
4252   A->boundtocpu = flg;
4253   if (flg && a->inode.size) {
4254     a->inode.use = PETSC_TRUE;
4255   } else {
4256     a->inode.use = PETSC_FALSE;
4257   }
4258   PetscFunctionReturn(0);
4259 }
4260 
4261 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4262 {
4263   Mat              B;
4264 
4265   PetscFunctionBegin;
4266   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4267   if (reuse == MAT_INITIAL_MATRIX) {
4268     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
4269   } else if (reuse == MAT_REUSE_MATRIX) {
4270     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
4271   }
4272   B = *newmat;
4273 
4274   PetscCall(PetscFree(B->defaultvectype));
4275   PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));
4276 
4277   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4278     if (B->factortype == MAT_FACTOR_NONE) {
4279       Mat_SeqAIJCUSPARSE *spptr;
4280       PetscCall(PetscNew(&spptr));
4281       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4282       PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
4283       spptr->format     = MAT_CUSPARSE_CSR;
4284      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4285      #if CUSPARSE_VERSION > 11301
4286       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4287      #else
4288       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
4289      #endif
4290       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4291       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4292      #endif
4293       B->spptr = spptr;
4294     } else {
4295       Mat_SeqAIJCUSPARSETriFactors *spptr;
4296 
4297       PetscCall(PetscNew(&spptr));
4298       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4299       PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
4300       B->spptr = spptr;
4301     }
4302     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4303   }
4304   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
4305   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
4306   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
4307   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4308   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
4309   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
4310 
4311   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
4312   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
4313   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4314 #if defined(PETSC_HAVE_HYPRE)
4315   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
4316 #endif
4317   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4318   PetscFunctionReturn(0);
4319 }
4320 
4321 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4322 {
4323   PetscFunctionBegin;
4324   PetscCall(MatCreate_SeqAIJ(B));
4325   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
4326   PetscFunctionReturn(0);
4327 }
4328 
4329 /*MC
4330    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4331 
4332    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
4333    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are no longer supported since CUDA 11.0.
4334    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
4335 
4336    Options Database Keys:
4337 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
4338 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
4339 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
4340 -  -mat_cusparse_use_cpu_solve - do MatSolve on the CPU
4341 
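   Example Usage (a minimal sketch; m and n are placeholder sizes):
.vb
   Mat A;
   PetscCall(MatCreate(PETSC_COMM_SELF,&A));
   PetscCall(MatSetSizes(A,m,n,m,n));
   PetscCall(MatSetType(A,MATSEQAIJCUSPARSE));
   PetscCall(MatSetFromOptions(A)); // picks up the options database keys above
   // set values and assemble as with any MATSEQAIJ matrix; MatMult() etc. then run on the GPU
.ve
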
4342   Level: beginner
4343 
4344 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4345 M*/
4346 
4347 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
4348 
4349 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4350 {
4351   PetscFunctionBegin;
4352   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
4353   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
4354   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
4355   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
4356   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));
4357 
4358   PetscFunctionReturn(0);
4359 }
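
/* A usage sketch: with the registrations above, the cuSPARSE-based factorizations can be selected
   at runtime, e.g.
     -pc_type lu -pc_factor_mat_solver_type cusparse
   or programmatically with PCFactorSetMatSolverType(pc,MATSOLVERCUSPARSE) */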
4360 
4361 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
4362 {
4363   Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;
4364 
4365   PetscFunctionBegin;
4366   if (!cusp) PetscFunctionReturn(0);
4367   delete cusp->cooPerm;
4368   delete cusp->cooPerm_a;
4369   cusp->cooPerm = NULL;
4370   cusp->cooPerm_a = NULL;
4371   if (cusp->use_extended_coo) {
4372     PetscCallCUDA(cudaFree(cusp->jmap_d));
4373     PetscCallCUDA(cudaFree(cusp->perm_d));
4374   }
4375   cusp->use_extended_coo = PETSC_FALSE;
4376   PetscFunctionReturn(0);
4377 }
4378 
4379 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
4380 {
4381   PetscFunctionBegin;
4382   if (*cusparsestruct) {
4383     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
4384     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
4385     delete (*cusparsestruct)->workVector;
4386     delete (*cusparsestruct)->rowoffsets_gpu;
4387     delete (*cusparsestruct)->cooPerm;
4388     delete (*cusparsestruct)->cooPerm_a;
4389     delete (*cusparsestruct)->csr2csc_i;
4390     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
4391     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
4392     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
4393     PetscCall(PetscFree(*cusparsestruct));
4394   }
4395   PetscFunctionReturn(0);
4396 }
4397 
4398 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4399 {
4400   PetscFunctionBegin;
4401   if (*mat) {
4402     delete (*mat)->values;
4403     delete (*mat)->column_indices;
4404     delete (*mat)->row_offsets;
4405     delete *mat;
4406     *mat = NULL;
4407   }
4408   PetscFunctionReturn(0);
4409 }
4410 
4411 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4412 {
4413   PetscFunctionBegin;
4414   if (*trifactor) {
4415     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4416     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4417     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4418     if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4419     if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4420    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4421     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4422    #endif
4423     PetscCall(PetscFree(*trifactor));
4424   }
4425   PetscFunctionReturn(0);
4426 }
4427 
4428 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
4429 {
4430   CsrMatrix        *mat;
4431 
4432   PetscFunctionBegin;
4433   if (*matstruct) {
4434     if ((*matstruct)->mat) {
4435       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
4436        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4437         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4438        #else
4439         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4440         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4441        #endif
4442       } else {
4443         mat = (CsrMatrix*)(*matstruct)->mat;
4444         CsrMatrix_Destroy(&mat);
4445       }
4446     }
4447     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4448     delete (*matstruct)->cprowIndices;
4449     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4450     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4451     if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4452 
4453    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4454     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4455     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4456     for (int i=0; i<3; i++) {
4457       if (mdata->cuSpMV[i].initialized) {
4458         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4459         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4460         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4461       }
4462     }
4463    #endif
4464     delete *matstruct;
4465     *matstruct = NULL;
4466   }
4467   PetscFunctionReturn(0);
4468 }
4469 
4470 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
4471 {
4472   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4473 
4474   PetscFunctionBegin;
4475   if (fs) {
4476     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4477     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4478     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4479     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4480     delete fs->rpermIndices;
4481     delete fs->cpermIndices;
4482     delete fs->workVector;
4483     fs->rpermIndices = NULL;
4484     fs->cpermIndices = NULL;
4485     fs->workVector = NULL;
4486     if (fs->a_band_d)   PetscCallCUDA(cudaFree(fs->a_band_d));
4487     if (fs->i_band_d)   PetscCallCUDA(cudaFree(fs->i_band_d));
4488     fs->init_dev_prop = PETSC_FALSE;
4489    #if CUSPARSE_VERSION >= 11500
4490     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4491     PetscCallCUDA(cudaFree(fs->csrColIdx));
4492     PetscCallCUDA(cudaFree(fs->csrVal));
4493     PetscCallCUDA(cudaFree(fs->X));
4494     PetscCallCUDA(cudaFree(fs->Y));
4495     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed, since factBuffer_M shares memory with one of spsvBuffer_L/U */
4496     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4497     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4498     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4499     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4500     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4501     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4502     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4503     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4504     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4505     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4506     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4507     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4508     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4509     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4510     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4511 
4512     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
4513     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4514    #endif
4515   }
4516   PetscFunctionReturn(0);
4517 }
4518 
4519 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
4520 {
4521   cusparseHandle_t handle;
4522 
4523   PetscFunctionBegin;
4524   if (*trifactors) {
4525     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4526     if ((handle = (*trifactors)->handle)) {
4527       PetscCallCUSPARSE(cusparseDestroy(handle));
4528     }
4529     PetscCall(PetscFree(*trifactors));
4530   }
4531   PetscFunctionReturn(0);
4532 }
4533 
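/* Thrust functors used by the COO assembly routines below: IJCompare orders (i,j) pairs by row and
   then by column, IJEqual tests two (i,j) pairs for equality, IJDiff flags a change between two
   adjacent values (1 if they differ, 0 otherwise), and IJSum is a logical OR of two flags */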
4534 struct IJCompare
4535 {
4536   __host__ __device__
4537   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4538   {
4539     if (t1.get<0>() < t2.get<0>()) return true;
4540     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4541     return false;
4542   }
4543 };
4544 
4545 struct IJEqual
4546 {
4547   __host__ __device__
4548   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4549   {
4550     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
4551     return true;
4552   }
4553 };
4554 
4555 struct IJDiff
4556 {
4557   __host__ __device__
4558   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
4559   {
4560     return t1 == t2 ? 0 : 1;
4561   }
4562 };
4563 
4564 struct IJSum
4565 {
4566   __host__ __device__
4567   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
4568   {
4569     return t1||t2;
4570   }
4571 };
4572 
4573 #include <thrust/iterator/discard_iterator.h>
4574 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
4575 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
4576 {
4577   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4578   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
4579   THRUSTARRAY                           *cooPerm_v = NULL;
4580   thrust::device_ptr<const PetscScalar> d_v;
4581   CsrMatrix                             *matrix;
4582   PetscInt                              n;
4583 
4584   PetscFunctionBegin;
4585   PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
4586   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
4587   if (!cusp->cooPerm) {
4588     PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
4589     PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
4590     PetscFunctionReturn(0);
4591   }
4592   matrix = (CsrMatrix*)cusp->mat->mat;
4593   PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4594   if (!v) {
4595     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
4596     goto finalize;
4597   }
4598   n = cusp->cooPerm->size();
4599   if (isCudaMem(v)) {
4600     d_v = thrust::device_pointer_cast(v);
4601   } else {
4602     cooPerm_v = new THRUSTARRAY(n);
4603     cooPerm_v->assign(v,v+n);
4604     d_v = cooPerm_v->data();
4605     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
4606   }
4607   PetscCall(PetscLogGpuTimeBegin());
4608   if (imode == ADD_VALUES) { /* ADD_VALUES means add to existing entries */
4609     if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them before adding */
4610       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
4611       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
4612       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4613         cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
4614         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
4615       */
4616       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
4617       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
4618       delete cooPerm_w;
4619     } else {
4620       /* all nonzeros in d_v[] are unique entries */
4621       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
4622                                                                 matrix->values->begin()));
4623       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
4624                                                                 matrix->values->end()));
4625       thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
4626     }
4627   } else {
4628     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
4629       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
4630       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
4631     } else {
4632       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
4633                                                                 matrix->values->begin()));
4634       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
4635                                                                 matrix->values->end()));
4636       thrust::for_each(zibit,zieit,VecCUDAEquals());
4637     }
4638   }
4639   PetscCall(PetscLogGpuTimeEnd());
4640 finalize:
4641   delete cooPerm_v;
4642   A->offloadmask = PETSC_OFFLOAD_GPU;
4643   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4644   /* shorter version of MatAssemblyEnd_SeqAIJ */
4645   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
4646   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
4647   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
4648   a->reallocs         = 0;
4649   A->info.mallocs    += 0;
4650   A->info.nz_unneeded = 0;
4651   A->assembled = A->was_assembled = PETSC_TRUE;
4652   A->num_ass++;
4653   PetscFunctionReturn(0);
4654 }
4655 
4656 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4657 {
4658   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4659 
4660   PetscFunctionBegin;
4661   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4662   if (!cusp) PetscFunctionReturn(0);
4663   if (destroy) {
4664     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
4665     delete cusp->csr2csc_i;
4666     cusp->csr2csc_i = NULL;
4667   }
4668   A->transupdated = PETSC_FALSE;
4669   PetscFunctionReturn(0);
4670 }
4671 
4672 #include <thrust/binary_search.h>
4673 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4674 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
4675 {
4676   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4677   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
4678   PetscInt           cooPerm_n, nzr = 0;
4679 
4680   PetscFunctionBegin;
4681   PetscCall(PetscLayoutSetUp(A->rmap));
4682   PetscCall(PetscLayoutSetUp(A->cmap));
4683   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4684   if (n != cooPerm_n) {
4685     delete cusp->cooPerm;
4686     delete cusp->cooPerm_a;
4687     cusp->cooPerm = NULL;
4688     cusp->cooPerm_a = NULL;
4689   }
4690   if (n) {
4691     THRUSTINTARRAY d_i(n);
4692     THRUSTINTARRAY d_j(n);
4693     THRUSTINTARRAY ii(A->rmap->n);
4694 
4695     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
4696     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
4697 
4698     PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
4699     d_i.assign(coo_i,coo_i+n);
4700     d_j.assign(coo_j,coo_j+n);
4701 
4702     /* Ex.
4703       n = 6
4704       coo_i = [3,3,1,4,1,4]
4705       coo_j = [3,2,2,5,2,6]
4706     */
4707     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
4708     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
4709 
4710     PetscCall(PetscLogGpuTimeBegin());
4711     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4712     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4713     *cusp->cooPerm_a = d_i; /* copy the sorted array */
4714     THRUSTINTARRAY w = d_j;
4715 
4716     /*
4717       d_i     = [1,1,3,3,4,4]
4718       d_j     = [2,2,2,3,5,6]
4719       cooPerm = [2,4,1,0,3,5]
4720     */
4721     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4722 
4723     /*
4724       d_i     = [1,3,3,4,4,x]
4725                             ^ekey
4726       d_j     = [2,2,3,5,6,x]
4727                            ^nekey
4728     */
4729     if (nekey == ekey) { /* all entries are unique */
4730       delete cusp->cooPerm_a;
4731       cusp->cooPerm_a = NULL;
4732     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4733       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4734       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4735       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4736       (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4737       w[0] = 0;
4738       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
4739       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4740     }
4741     thrust::counting_iterator<PetscInt> search_begin(0);
4742     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search the entries 0,1,...,A->rmap->n-1 in the ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4743                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4744                         ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4745     PetscCall(PetscLogGpuTimeEnd());
4746 
4747     PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
4748     a->singlemalloc = PETSC_FALSE;
4749     a->free_a       = PETSC_TRUE;
4750     a->free_ij      = PETSC_TRUE;
4751     PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
4752     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4753     PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4754     a->nz = a->maxnz = a->i[A->rmap->n];
4755     a->rmax = 0;
4756     PetscCall(PetscMalloc1(a->nz,&a->a));
4757     PetscCall(PetscMalloc1(a->nz,&a->j));
4758     PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4759     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
4760     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
4761     for (PetscInt i = 0; i < A->rmap->n; i++) {
4762       const PetscInt nnzr = a->i[i+1] - a->i[i];
4763       nzr += (PetscInt)!!(nnzr);
4764       a->ilen[i] = a->imax[i] = nnzr;
4765       a->rmax = PetscMax(a->rmax,nnzr);
4766     }
4767     a->nonzerorowcnt = nzr;
4768     A->preallocated = PETSC_TRUE;
4769     PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
4770     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4771   } else {
4772     PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
4773   }
4774   PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));
4775 
4776   /* We want to allocate the CUSPARSE struct for matvec now.
4777      The code is so convoluted now that I prefer to copy zeros */
4778   PetscCall(PetscArrayzero(a->a,a->nz));
4779   PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
4780   A->offloadmask = PETSC_OFFLOAD_CPU;
4781   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4782   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
4783   PetscFunctionReturn(0);
4784 }
4785 
4786 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
4787 {
4788   Mat_SeqAIJ         *seq;
4789   Mat_SeqAIJCUSPARSE *dev;
4790   PetscBool          coo_basic = PETSC_TRUE;
4791   PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;
4792 
4793   PetscFunctionBegin;
4794   PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
4795   PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
4796   if (coo_i) {
4797     PetscCall(PetscGetMemType(coo_i,&mtype));
4798     if (PetscMemTypeHost(mtype)) {
4799       for (PetscCount k=0; k<coo_n; k++) {
4800         if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
4801       }
4802     }
4803   }
4804 
4805   if (coo_basic) { /* i,j are on device or do not contain negative indices */
4806     PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
4807   } else {
4808     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
4809     mat->offloadmask = PETSC_OFFLOAD_CPU;
4810     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4811     seq  = static_cast<Mat_SeqAIJ*>(mat->data);
4812     dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
4813     PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
4814     PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
4815     PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
4816     PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
4817     dev->use_extended_coo = PETSC_TRUE;
4818   }
4819   PetscFunctionReturn(0);
4820 }
4821 
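/* Grid-stride kernel used by MatSetValuesCOO_SeqAIJCUSPARSE(): for each nonzero i of the matrix,
   sum the (possibly repeated) input values kv[perm[jmap[i]]],...,kv[perm[jmap[i+1]-1]] and either
   insert the sum into a[i] or add it to a[i], depending on imode */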
4822 __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
4823 {
4824   PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
4825   const PetscCount  grid_size = gridDim.x * blockDim.x;
4826   for (; i<nnz; i+= grid_size) {
4827     PetscScalar sum = 0.0;
4828     for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
4829     a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
4830   }
4831 }
4832 
4833 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4834 {
4835   Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
4836   Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
4837   PetscCount          Annz = seq->nz;
4838   PetscMemType        memtype;
4839   const PetscScalar   *v1 = v;
4840   PetscScalar         *Aa;
4841 
4842   PetscFunctionBegin;
4843   if (dev->use_extended_coo) {
4844     PetscCall(PetscGetMemType(v,&memtype));
4845     if (PetscMemTypeHost(memtype)) { /* if the user provided v[] on the host, copy it to the device */
4846       PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
4847       PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
4848     }
4849 
4850     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
4851     else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));
4852 
4853     if (Annz) {
4854       MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
4855       PetscCallCUDA(cudaPeekAtLastError());
4856     }
4857 
4858     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
4859     else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
4860 
4861     if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
4862   } else {
4863     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
4864   }
4865   PetscFunctionReturn(0);
4866 }
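/* A minimal usage sketch of the COO assembly path implemented above (n, coo_i, coo_j, and v are
   caller-provided placeholders; repeated (i,j) entries are summed):

     PetscCall(MatSetPreallocationCOO(A,n,coo_i,coo_j)); // dispatches to MatSetPreallocationCOO_SeqAIJCUSPARSE()
     PetscCall(MatSetValuesCOO(A,v,ADD_VALUES));         // dispatches to MatSetValuesCOO_SeqAIJCUSPARSE(); v may be host or device memory
*/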
4867 
4868 /*@C
4869     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4870 
4871    Not collective
4872 
4873     Input Parameters:
4874 +   A - the matrix
4875 -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4876 
4877     Output Parameters:
4878 +   i - the CSR row pointers
4879 -   j - the CSR column indices
4880 
4881     Level: developer
4882 
4883     Notes:
4884       When compressed is true, the CSR structure does not contain empty rows
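
      A usage sketch:
.vb
      const int *i,*j;
      PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_TRUE,&i,&j));
      // i and j point to device memory holding the CSR row offsets and column indices
      PetscCall(MatSeqAIJCUSPARSERestoreIJ(A,PETSC_TRUE,&i,&j));
.ve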
4885 
4886 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4887 @*/
4888 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4889 {
4890   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4891   CsrMatrix          *csr;
4892   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
4893 
4894   PetscFunctionBegin;
4895   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4896   if (!i || !j) PetscFunctionReturn(0);
4897   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4898   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4899   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4900   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4901   csr = (CsrMatrix*)cusp->mat->mat;
4902   if (i) {
4903     if (!compressed && a->compressedrow.use) { /* need full row offset */
4904       if (!cusp->rowoffsets_gpu) {
4905         cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4906         cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4907         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
4908       }
4909       *i = cusp->rowoffsets_gpu->data().get();
4910     } else *i = csr->row_offsets->data().get();
4911   }
4912   if (j) *j = csr->column_indices->data().get();
4913   PetscFunctionReturn(0);
4914 }
4915 
4916 /*@C
4917     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4918 
4919    Not collective
4920 
4921     Input Parameters:
4922 +   A - the matrix
4923 -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4924 
4925     Output Parameters:
4926 +   i - the CSR row pointers
4927 -   j - the CSR column indices
4928 
4929     Level: developer
4930 
4931 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4932 @*/
4933 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4934 {
4935   PetscFunctionBegin;
4936   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4937   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4938   if (i) *i = NULL;
4939   if (j) *j = NULL;
4940   PetscFunctionReturn(0);
4941 }
4942 
4943 /*@C
4944    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4945 
4946    Not Collective
4947 
4948    Input Parameter:
4949 .   A - a MATSEQAIJCUSPARSE matrix
4950 
4951    Output Parameter:
4952 .   a - pointer to the device data
4953 
4954    Level: developer
4955 
4956    Notes: may trigger host-to-device copies if the up-to-date matrix data is on the host
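
   A usage sketch:
.vb
   const PetscScalar *a;
   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&a));
   // a points to device memory holding the nonzero values; read-only access
   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&a));
.ve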
4957 
4958 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4959 @*/
4960 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4961 {
4962   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4963   CsrMatrix          *csr;
4964 
4965   PetscFunctionBegin;
4966   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4967   PetscValidPointer(a,2);
4968   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4969   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4970   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4971   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4972   csr = (CsrMatrix*)cusp->mat->mat;
4973   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4974   *a = csr->values->data().get();
4975   PetscFunctionReturn(0);
4976 }
4977 
4978 /*@C
4979    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4980 
4981    Not Collective
4982 
4983    Input Parameter:
4984 .   A - a MATSEQAIJCUSPARSE matrix
4985 
4986    Output Parameter:
4987 .   a - pointer to the device data
4988 
4989    Level: developer
4990 
4991 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4992 @*/
4993 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4994 {
4995   PetscFunctionBegin;
4996   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4997   PetscValidPointer(a,2);
4998   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4999   *a = NULL;
5000   PetscFunctionReturn(0);
5001 }
5002 
5003 /*@C
5004    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
5005 
5006    Not Collective
5007 
5008    Input Parameter:
5009 .   A - a MATSEQAIJCUSPARSE matrix
5010 
5011    Output Parameter:
5012 .   a - pointer to the device data
5013 
5014    Level: developer
5015 
5016    Notes: may trigger host-to-device copies if the up-to-date matrix data is on the host
5017 
5018 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
5019 @*/
5020 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
5021 {
5022   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
5023   CsrMatrix          *csr;
5024 
5025   PetscFunctionBegin;
5026   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5027   PetscValidPointer(a,2);
5028   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5029   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5030   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5031   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5032   csr = (CsrMatrix*)cusp->mat->mat;
5033   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
5034   *a = csr->values->data().get();
5035   A->offloadmask = PETSC_OFFLOAD_GPU;
5036   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
5037   PetscFunctionReturn(0);
5038 }
5039 /*@C
5040    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
5041 
5042    Not Collective
5043 
5044    Input Parameter:
5045 .   A - a MATSEQAIJCUSPARSE matrix
5046 
5047    Output Parameter:
5048 .   a - pointer to the device data
5049 
5050    Level: developer
5051 
5052 .seealso: `MatSeqAIJCUSPARSEGetArray()`
5053 @*/
5054 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
5055 {
5056   PetscFunctionBegin;
5057   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5058   PetscValidPointer(a,2);
5059   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5060   PetscCall(MatSeqAIJInvalidateDiagonal(A));
5061   PetscCall(PetscObjectStateIncrease((PetscObject)A));
5062   *a = NULL;
5063   PetscFunctionReturn(0);
5064 }
5065 
5066 /*@C
5067    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
5068 
5069    Not Collective
5070 
5071    Input Parameter:
5072 .   A - a MATSEQAIJCUSPARSE matrix
5073 
5074    Output Parameter:
5075 .   a - pointer to the device data
5076 
5077    Level: developer
5078 
5079    Notes: does not trigger host-to-device copies and marks the device data as the valid copy
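
   A usage sketch (the existing values need not be preserved, so overwrite them all):
.vb
   PetscScalar *a;
   PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&a));
   // fill all nonzero values on the device, e.g. from a kernel
   PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&a));
.ve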
5080 
5081 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
5082 @*/
5083 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
5084 {
5085   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
5086   CsrMatrix          *csr;
5087 
5088   PetscFunctionBegin;
5089   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5090   PetscValidPointer(a,2);
5091   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5092   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5093   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5094   csr = (CsrMatrix*)cusp->mat->mat;
5095   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
5096   *a = csr->values->data().get();
5097   A->offloadmask = PETSC_OFFLOAD_GPU;
5098   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
5099   PetscFunctionReturn(0);
5100 }
5101 
5102 /*@C
5103    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
5104 
5105    Not Collective
5106 
5107    Input Parameter:
5108 .   A - a MATSEQAIJCUSPARSE matrix
5109 
5110    Output Parameter:
5111 .   a - pointer to the device data
5112 
5113    Level: developer
5114 
5115 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
5116 @*/
5117 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
5118 {
5119   PetscFunctionBegin;
5120   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5121   PetscValidPointer(a,2);
5122   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5123   PetscCall(MatSeqAIJInvalidateDiagonal(A));
5124   PetscCall(PetscObjectStateIncrease((PetscObject)A));
5125   *a = NULL;
5126   PetscFunctionReturn(0);
5127 }
5128 
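/* IJCompare4 orders (row,col,val,flag) tuples by row and then by column; Shift adds a fixed
   offset to a column index (used below to place the columns of B after those of A) */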
5129 struct IJCompare4
5130 {
5131   __host__ __device__
5132   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
5133   {
5134     if (t1.get<0>() < t2.get<0>()) return true;
5135     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
5136     return false;
5137   }
5138 };
5139 
5140 struct Shift
5141 {
5142   int _shift;
5143 
5144   Shift(int shift) : _shift(shift) {}
5145   __host__ __device__
5146   inline int operator() (const int &c)
5147   {
5148     return c + _shift;
5149   }
5150 };
5151 
5152 /* merges two SeqAIJCUSPARSE matrices A and B row by row, appending the columns of B after those of A: C = [A, B], the [A';B']' operation in MATLAB notation */
5153 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
5154 {
5155   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
5156   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
5157   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
5158   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
5159   PetscInt                     Annz,Bnnz;
5160   cusparseStatus_t             stat;
5161   PetscInt                     i,m,n,zero = 0;
5162 
5163   PetscFunctionBegin;
5164   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5165   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
5166   PetscValidPointer(C,4);
5167   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5168   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
5169   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
5170   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
5171   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5172   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5173   if (reuse == MAT_INITIAL_MATRIX) {
5174     m     = A->rmap->n;
5175     n     = A->cmap->n + B->cmap->n;
5176     PetscCall(MatCreate(PETSC_COMM_SELF,C));
5177     PetscCall(MatSetSizes(*C,m,n,m,n));
5178     PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
5179     c     = (Mat_SeqAIJ*)(*C)->data;
5180     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
5181     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
5182     Ccsr  = new CsrMatrix;
5183     Cmat->cprowIndices      = NULL;
5184     c->compressedrow.use    = PETSC_FALSE;
5185     c->compressedrow.nrows  = 0;
5186     c->compressedrow.i      = NULL;
5187     c->compressedrow.rindex = NULL;
5188     Ccusp->workVector       = NULL;
5189     Ccusp->nrows    = m;
5190     Ccusp->mat      = Cmat;
5191     Ccusp->mat->mat = Ccsr;
5192     Ccsr->num_rows  = m;
5193     Ccsr->num_cols  = n;
5194     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
5195     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
5196     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5197     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
5198     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
5199     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
5200     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5201     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
5202     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5203     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5204     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5205     PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5206     PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5207 
5208     Acsr = (CsrMatrix*)Acusp->mat->mat;
5209     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
5210     Annz = (PetscInt)Acsr->column_indices->size();
5211     Bnnz = (PetscInt)Bcsr->column_indices->size();
5212     c->nz = Annz + Bnnz;
5213     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
5214     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
5215     Ccsr->values = new THRUSTARRAY(c->nz);
5216     Ccsr->num_entries = c->nz;
5217     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
5218     if (c->nz) {
5219       auto Acoo = new THRUSTINTARRAY32(Annz);
5220       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
5221       auto Ccoo = new THRUSTINTARRAY32(c->nz);
5222       THRUSTINTARRAY32 *Aroff,*Broff;
5223 
5224       if (a->compressedrow.use) { /* need full row offset */
5225         if (!Acusp->rowoffsets_gpu) {
5226           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
5227           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
5228           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
5229         }
5230         Aroff = Acusp->rowoffsets_gpu;
5231       } else Aroff = Acsr->row_offsets;
5232       if (b->compressedrow.use) { /* need full row offset */
5233         if (!Bcusp->rowoffsets_gpu) {
5234           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
5235           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
5236           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
5237         }
5238         Broff = Bcusp->rowoffsets_gpu;
5239       } else Broff = Bcsr->row_offsets;
5240       PetscCall(PetscLogGpuTimeBegin());
5241       stat = cusparseXcsr2coo(Acusp->handle,
5242                               Aroff->data().get(),
5243                               Annz,
5244                               m,
5245                               Acoo->data().get(),
5246                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5247       stat = cusparseXcsr2coo(Bcusp->handle,
5248                               Broff->data().get(),
5249                               Bnnz,
5250                               m,
5251                               Bcoo->data().get(),
5252                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5253       /* use int markers rather than bool: there are issues using bool with large matrices on SUMMIT with CUDA 10.2.89 */
5254       auto Aperm = thrust::make_constant_iterator(1);
5255       auto Bperm = thrust::make_constant_iterator(0);
5256 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
5257       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
5258       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
5259 #else
5260       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
5261       auto Bcib = Bcsr->column_indices->begin();
5262       auto Bcie = Bcsr->column_indices->end();
5263       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
5264 #endif
5265       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
5266       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
5267       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
5268       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
5269       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
5270       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
5271       auto p1 = Ccusp->cooPerm->begin();
5272       auto p2 = Ccusp->cooPerm->begin();
5273       thrust::advance(p2,Annz);
5274       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
5275 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
5276       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
5277 #endif
5278       auto cci = thrust::make_counting_iterator(zero);
5279       auto cce = thrust::make_counting_iterator(c->nz);
5280 #if 0 // thrust::partition_copy errors on SUMMIT with CUDA 11.1.0
5281       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
5282 #else
5283       auto pred = thrust::identity<int>();
5284       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
5285       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
5286 #endif
5287       stat = cusparseXcoo2csr(Ccusp->handle,
5288                               Ccoo->data().get(),
5289                               c->nz,
5290                               m,
5291                               Ccsr->row_offsets->data().get(),
5292                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5293       PetscCall(PetscLogGpuTimeEnd());
5294       delete wPerm;
5295       delete Acoo;
5296       delete Bcoo;
5297       delete Ccoo;
5298 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
5299       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
5300                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
5301                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
5302                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
5303 #endif
5304       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
5305         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
5306         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
5307         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5308         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
5309         CsrMatrix *CcsrT = new CsrMatrix;
5310         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
5311         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
5312 
5313         (*C)->form_explicit_transpose = PETSC_TRUE;
5314         (*C)->transupdated = PETSC_TRUE;
5315         Ccusp->rowoffsets_gpu = NULL;
5316         CmatT->cprowIndices = NULL;
5317         CmatT->mat = CcsrT;
5318         CcsrT->num_rows = n;
5319         CcsrT->num_cols = m;
5320         CcsrT->num_entries = c->nz;
5321 
5322         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
5323         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
5324         CcsrT->values = new THRUSTARRAY(c->nz);
5325 
5326         PetscCall(PetscLogGpuTimeBegin());
5327         auto rT = CcsrT->row_offsets->begin();
5328         if (AT) {
5329           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
5330           thrust::advance(rT,-1);
5331         }
5332         if (BT) {
5333           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
5334           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
5335           thrust::copy(titb,tite,rT);
5336         }
5337         auto cT = CcsrT->column_indices->begin();
5338         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
5339         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
5340         auto vT = CcsrT->values->begin();
5341         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
5342         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
5343         PetscCall(PetscLogGpuTimeEnd());
5344 
5345         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
5346         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
5347         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5348         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
5349         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
5350         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
5351         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5352         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
5353         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5354 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
5355         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
5356                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
5357                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
5358                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
5359 #endif
5360         Ccusp->matTranspose = CmatT;
5361       }
5362     }
5363 
5364     c->singlemalloc = PETSC_FALSE;
5365     c->free_a       = PETSC_TRUE;
5366     c->free_ij      = PETSC_TRUE;
5367     PetscCall(PetscMalloc1(m+1,&c->i));
5368     PetscCall(PetscMalloc1(c->nz,&c->j));
5369     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
5370       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
5371       THRUSTINTARRAY jj(Ccsr->column_indices->size());
5372       ii   = *Ccsr->row_offsets;
5373       jj   = *Ccsr->column_indices;
5374       PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5375       PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5376     } else {
5377       PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5378       PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5379     }
5380     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
5381     PetscCall(PetscMalloc1(m,&c->ilen));
5382     PetscCall(PetscMalloc1(m,&c->imax));
5383     c->maxnz = c->nz;
5384     c->nonzerorowcnt = 0;
5385     c->rmax = 0;
5386     for (i = 0; i < m; i++) {
5387       const PetscInt nn = c->i[i+1] - c->i[i];
5388       c->ilen[i] = c->imax[i] = nn;
5389       c->nonzerorowcnt += (PetscInt)!!nn;
5390       c->rmax = PetscMax(c->rmax,nn);
5391     }
5392     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
5393     PetscCall(PetscMalloc1(c->nz,&c->a));
5394     (*C)->nonzerostate++;
5395     PetscCall(PetscLayoutSetUp((*C)->rmap));
5396     PetscCall(PetscLayoutSetUp((*C)->cmap));
5397     Ccusp->nonzerostate = (*C)->nonzerostate;
5398     (*C)->preallocated  = PETSC_TRUE;
5399   } else {
5400     PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
5401     c = (Mat_SeqAIJ*)(*C)->data;
5402     if (c->nz) {
5403       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
5404       PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
5405       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5406       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
5407       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5408       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5409       PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5410       PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5411       Acsr = (CsrMatrix*)Acusp->mat->mat;
5412       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
5413       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
5414       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
5415       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
5416       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
5417       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
5418       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
5419       auto pmid = Ccusp->cooPerm->begin();
5420       thrust::advance(pmid,Acsr->num_entries);
5421       PetscCall(PetscLogGpuTimeBegin());
5422       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
5423                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
5424       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
5425                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
5426       thrust::for_each(zibait,zieait,VecCUDAEquals());
5427       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
5428                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
5429       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
5430                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
5431       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
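/*
   The two zip/permutation iterator pairs above implement a scatter: VecCUDAEquals assigns
   the second tuple entry from the first, i.e. Cvals[cooPerm[k]] = Avals[k] for k < nnz(A),
   and Cvals[cooPerm[nnz(A)+k]] = Bvals[k] for the remaining entries.  A minimal sketch of
   the same idiom with hypothetical names (src, dst, perm):

     auto first = thrust::make_zip_iterator(thrust::make_tuple(src.begin(),
                    thrust::make_permutation_iterator(dst.begin(),perm.begin())));
     auto last  = thrust::make_zip_iterator(thrust::make_tuple(src.end(),
                    thrust::make_permutation_iterator(dst.begin(),perm.end())));
     thrust::for_each(first,last,VecCUDAEquals()); // dst[perm[k]] = src[k]
*/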
5432       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
5433       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
5434         PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5435         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5436         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
5437         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
5438         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
5439         auto vT = CcsrT->values->begin();
5440         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
5441         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
5442         (*C)->transupdated = PETSC_TRUE;
5443       }
5444       PetscCall(PetscLogGpuTimeEnd());
5445     }
5446   }
5447   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5448   (*C)->assembled     = PETSC_TRUE;
5449   (*C)->was_assembled = PETSC_FALSE;
5450   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
5451   PetscFunctionReturn(0);
5452 }
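/*
   Note on the finalization above: offloadmask = PETSC_OFFLOAD_GPU records that the merged
   values currently live only on the device; the host value array c->a allocated in the
   structure-building branch is left unfilled until an accessor forces a sync.  A hedged
   usage sketch with the standard accessor pair (not specific to this routine):

     const PetscScalar *a;
     PetscCall(MatSeqAIJGetArrayRead(C,&a));      // triggers the GPU -> CPU copy if needed
     ...
     PetscCall(MatSeqAIJRestoreArrayRead(C,&a));
*/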
5453 
5454 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
5455 {
5456   bool              dmem;
5457   const PetscScalar *av;
5458 
5459   PetscFunctionBegin;
5460   dmem = isCudaMem(v);
5461   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
5462   if (n && idx) {
5463     THRUSTINTARRAY widx(n);
5464     widx.assign(idx,idx+n);
5465     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
5466 
5467     THRUSTARRAY *w = NULL;
5468     thrust::device_ptr<PetscScalar> dv;
5469     if (dmem) {
5470       dv = thrust::device_pointer_cast(v);
5471     } else {
5472       w = new THRUSTARRAY(n);
5473       dv = w->data();
5474     }
5475     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5476 
5477     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
5478     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
5479     thrust::for_each(zibit,zieit,VecCUDAEquals());
5480     if (w) {
5481       PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
5482     }
5483     delete w;
5484   } else {
5485     PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5486   }
5487   if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar))); /* v is host memory, so the n scalars moved device to host */
5488   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
5489   PetscFunctionReturn(0);
5490 }
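/*
   Semantics sketch for MatSeqAIJCopySubArray_SeqAIJCUSPARSE() above: it gathers selected
   matrix values, i.e. for k = 0..n-1

     v[k] = av[idx ? idx[k] : k];   // av = device-side CSR values array

   with the gather executed on the GPU regardless of where v lives (detected via
   isCudaMem()); when v is host memory the result lands in a scratch device buffer w and
   comes back with a single device-to-host cudaMemcpy.
*/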
5491