/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
#include <../src/mat/impls/sbaij/seq/sbaij.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/vecimpl.h>
#undef VecType
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <thrust/adjacent_difference.h>
#include <thrust/async/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
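  /* Example (illustrative): "-mat_cusparse_spmv_alg CSRMV_ALG1" parses to enum value 2, because
     "CSRMV_ALG1" sits at index 2 of MatCUSPARSESpMVAlgorithms[], matching CUSPARSE_CSRMV_ALG1 = 2
     above; the PetscCheck() calls in MatSetFromOptions_SeqAIJCUSPARSE() guard this correspondence. */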
#endif

static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

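   Example Usage:
.vb
   /* a minimal sketch: A is assumed to be an assembled MATSEQAIJCUSPARSE matrix */
   PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));
.ve
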
   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
   MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU MatSolve() with a SEQAIJCUSPARSE matrix.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - flag for using the built-in CPU MatSolve()

   Notes:
   The cuSPARSE LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This routine specifies whether the solve is done on the CPU or the GPU (GPU is the default).
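
   Example Usage:
.vb
   /* a minimal sketch: keep the triangular solves of an (I)LU factorization on the CPU */
   PetscCall(MatCUSPARSESetUseCPUSolve(A, PETSC_TRUE));
.ve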

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
    case MAT_FORM_EXPLICIT_TRANSPOSE:
      /* destroy any existing transpose matrix, to prevent logic errors if flg is set to true later */
      if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      A->form_explicit_transpose = flg;
      break;
    default:
      PetscCall(MatSetOption_SeqAIJ(A,op,flg));
      break;
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If the user used this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
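
/* Example (illustrative) command-line usage of the options registered above, where "./app"
   stands for any PETSc program using this matrix type (the *_alg options require a CUDA 11 build):
     ./app -mat_type seqaijcusparse -mat_cusparse_storage_format ell \
           -mat_cusparse_use_cpu_solve -mat_cusparse_spmv_alg CSRMV_ALG1
   The enum option values come from the MatCUSPARSEStorageFormats[] and
   MatCUSPARSESpMVAlgorithms[] string lists defined near the top of this file. */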
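/* MatSeqAIJCUSPARSEBuildILULowerTriMatrix() assembles, on the host, the unit-diagonal lower
   triangular factor L in CSR form and copies it to the GPU. Illustrative 3x3 layout (entry
   names invented): if the strictly lower entries of the factorization are l10 and l21, the
   arrays built below are
     AiLo = {0, 1, 3, 5}
     AjLo = {0,   0, 1,   1, 2}
     AALo = {1, l10, 1, l21, 1}
   i.e. each row stores its strictly lower entries followed by an explicit 1.0 on the diagonal. */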
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

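/* MatSeqAIJCUSPARSEBuildILUUpperTriMatrix() assembles, on the host, the upper triangular
   factor U in CSR form and copies it to the GPU, walking the factored SeqAIJ storage
   backwards through a->diag. Each output row stores the reciprocal of the stored diagonal
   entry first, followed by the strictly upper entries. Illustrative 2x2 layout (entry names
   invented): with stored diagonal entries d0,d1 and upper entry u01,
     AiUp = {0, 2, 3}
     AjUp = {0, 1,   1}
     AAUp = {1/d0, u01, 1/d1}
*/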
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else {
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

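/* For ICC, the two triangular solves share a single sparsity pattern: only the upper
   factor's CSR structure (AiUp/AjUp) is built below, and the "lower" factor reuses that
   pattern with CUSPARSE_OPERATION_TRANSPOSE and separately scaled values (AALo). */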
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upTriFactor");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing loTriFactor");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (perm_identity) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

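/* Build explicit transposes (CSC form, obtained via csr2csc) of both triangular factors and
   run the csrsv analysis on them, so that MatSolveTranspose can later use non-transposed
   triangular solves on the transposed data. */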
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
                                  #else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                         loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                         loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                         loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                         &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                   loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                   loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                   loTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   loTriFactorT->solveInfo,
                                   loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
                                   #else
                                   loTriFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                               upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                               upTriFactor->csrMat->values->data().get(),
                                               upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(),
                                               upTriFactorT->csrMat->values->data().get(),
                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                  upTriFactor->csrMat->values->data().get(),
                                  upTriFactor->csrMat->row_offsets->data().get(),
                                  upTriFactor->csrMat->column_indices->data().get(),
                                  upTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
                                  #else
                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif

  PetscCallCUDA(WaitForCUDA());
1047   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1048 
1049   /* Create the solve analysis information */
1050   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1051   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
1052   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1053   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1054                                          upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1055                                          upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1056                                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1057                                          &upTriFactorT->solveBufferSize));
1058   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
1059   #endif
1060 
1061   /* perform the solve analysis */
1062   /* TODO: this buffer-size/analysis boilerplate is repeated for every factor in this file and should be refactored into a helper function */
1063   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1064                                    upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1065                                    upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1066                                    upTriFactorT->csrMat->column_indices->data().get(),
1067                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1068                                    upTriFactorT->solveInfo,
1069                                    upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1070                                    #else
1071                                    upTriFactorT->solveInfo));
1072                                    #endif
1073 
1074   PetscCallCUDA(WaitForCUDA());
1075   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1076 
1077   /* assign the pointer */
1078   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1079   PetscFunctionReturn(0);
1080 }
1081 
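/* Functor used with thrust::transform to turn the scalar output of a csr2csc run back into integer
   indices. See MatSeqAIJCUSPARSEFormExplicitTranspose() below: there we feed csr2csc the sequence
   0,1,2,... stored as PetscScalars, so the "values" of the transposed matrix are precisely the
   permutation mapping CSR value order to CSC value order. */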
1082 struct PetscScalarToPetscInt
1083 {
1084   __host__ __device__
1085   PetscInt operator()(PetscScalar s)
1086   {
1087     return (PetscInt)PetscRealPart(s);
1088   }
1089 };
1090 
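/* Build (or refresh) the explicit transpose of A on the GPU, cached in cusparsestruct->matTranspose.
   The transpose structure is created once; on subsequent calls only the numerical values are permuted
   into place through the cached csr2csc_i index array, avoiding another full csr2csc conversion. */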
1091 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1092 {
1093   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1094   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1095   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1096   cusparseStatus_t             stat;
1097   cusparseIndexBase_t          indexBase;
1098 
1099   PetscFunctionBegin;
1100   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1101   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1102   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1103   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1104   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1105   if (A->transupdated) PetscFunctionReturn(0);
1106   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1107   PetscCall(PetscLogGpuTimeBegin());
1108   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1109     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1110   }
1111   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1112     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1113     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1114     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1115     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1116     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1117 
1118     /* set alpha and beta */
1119     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
1120     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
1121     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1122     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1123     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
1124     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1125 
1126     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1127       CsrMatrix *matrixT = new CsrMatrix;
1128       matstructT->mat = matrixT;
1129       matrixT->num_rows = A->cmap->n;
1130       matrixT->num_cols = A->rmap->n;
1131       matrixT->num_entries = a->nz;
1132       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1133       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1134       matrixT->values = new THRUSTARRAY(a->nz);
1135 
1136       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1137       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1138 
1139      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1140       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1141         stat = cusparseCreateCsr(&matstructT->matDescr,
1142                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1143                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1144                                matrixT->values->data().get(),
1145                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1146                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1147       #else
1148         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1149            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1150 
1151            It is unclear what a proper value of matstructT->matDescr would be for an empty matrix, so we set
1152            it to NULL so that any code relying on it fails loudly. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1153            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase, so we set it accordingly.
1154         */
1155         if (matrixT->num_entries) {
1156           stat = cusparseCreateCsr(&matstructT->matDescr,
1157                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1158                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1159                                  matrixT->values->data().get(),
1160                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1161                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1162 
1163         } else {
1164           matstructT->matDescr = NULL;
1165           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1166         }
1167       #endif
1168      #endif
1169     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1170    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1171       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1172    #else
1173       CsrMatrix *temp  = new CsrMatrix;
1174       CsrMatrix *tempT = new CsrMatrix;
1175       /* First convert HYB to CSR */
1176       temp->num_rows = A->rmap->n;
1177       temp->num_cols = A->cmap->n;
1178       temp->num_entries = a->nz;
1179       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1180       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1181       temp->values = new THRUSTARRAY(a->nz);
1182 
1183       stat = cusparse_hyb2csr(cusparsestruct->handle,
1184                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1185                               temp->values->data().get(),
1186                               temp->row_offsets->data().get(),
1187                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1188 
1189       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1190       tempT->num_rows = A->cmap->n; /* tempT is the transpose, i.e. A->cmap->n by A->rmap->n */
1191       tempT->num_cols = A->rmap->n;
1192       tempT->num_entries = a->nz;
1193       tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
1194       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1195       tempT->values = new THRUSTARRAY(a->nz);
1196 
1197       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1198                               temp->num_cols, temp->num_entries,
1199                               temp->values->data().get(),
1200                               temp->row_offsets->data().get(),
1201                               temp->column_indices->data().get(),
1202                               tempT->values->data().get(),
1203                               tempT->column_indices->data().get(),
1204                               tempT->row_offsets->data().get(),
1205                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1206 
1207       /* Last, convert CSC to HYB */
1208       cusparseHybMat_t hybMat;
1209       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1210       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1211         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1212       stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
1213                               matstructT->descr, tempT->values->data().get(),
1214                               tempT->row_offsets->data().get(),
1215                               tempT->column_indices->data().get(),
1216                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1217 
1218       /* assign the pointer */
1219       matstructT->mat = hybMat;
1220       A->transupdated = PETSC_TRUE;
1221       /* delete temporaries */
1222       if (tempT) {
1223         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1224         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1225         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1226         delete (CsrMatrix*) tempT;
1227       }
1228       if (temp) {
1229         if (temp->values) delete (THRUSTARRAY*) temp->values;
1230         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1231         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1232         delete (CsrMatrix*) temp;
1233       }
1234      #endif
1235     }
1236   }
1237   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1238     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1239     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1240     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1241     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1242     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1243     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1244     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1245     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1246     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1247     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1248     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1249       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1250       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1251       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1252     }
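    /* Build the CSR -> CSC value permutation once: transpose the sequence 0,1,2,... (stored as scalars)
       so that the transposed "values" give, for each CSC slot, the originating CSR position; the
       PetscScalarToPetscInt functor then converts these back to integer indices. */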
1253     if (!cusparsestruct->csr2csc_i) {
1254       THRUSTARRAY csr2csc_a(matrix->num_entries);
1255       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1256 
1257       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1258      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259       void   *csr2cscBuffer;
1260       size_t csr2cscBufferSize;
1261       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1262                                            A->cmap->n, matrix->num_entries,
1263                                            matrix->values->data().get(),
1264                                            cusparsestruct->rowoffsets_gpu->data().get(),
1265                                            matrix->column_indices->data().get(),
1266                                            matrixT->values->data().get(),
1267                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1268                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1269                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
1270       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1271      #endif
1272 
1273       if (matrix->num_entries) {
1274         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1275            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11 (CUDA-10 is fine), even though
1276            every parameter checks out; the cause is unknown.
1277
1278            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1279            should simply be filled with indexBase, so we skip the call and do that in the else branch below.
1280         */
1281         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1282                               A->cmap->n,matrix->num_entries,
1283                               csr2csc_a.data().get(),
1284                               cusparsestruct->rowoffsets_gpu->data().get(),
1285                               matrix->column_indices->data().get(),
1286                               matrixT->values->data().get(),
1287                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1288                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1289                               CUSPARSE_ACTION_NUMERIC,indexBase,
1290                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1291                              #else
1292                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1293                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1294                              #endif
1295       } else {
1296         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1297       }
1298 
1299       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1300       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1301      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1302       PetscCallCUDA(cudaFree(csr2cscBuffer));
1303      #endif
1304     }
1305     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1306                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1307                                                      matrixT->values->begin()));
1308   }
1309   PetscCall(PetscLogGpuTimeEnd());
1310   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1311   /* the compressed row indices are not used for matTranspose */
1312   matstructT->cprowIndices = NULL;
1313   /* assign the pointer */
1314   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1315   A->transupdated = PETSC_TRUE;
1316   PetscFunctionReturn(0);
1317 }
1318 
1319 /* TODO: why must the transposed factors be analyzed separately? Could we not pass op(A) = CUSPARSE_OPERATION_TRANSPOSE to the solves in MatSolve_SeqAIJCUSPARSE instead? */
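/* Solve A^T x = b using the explicitly transposed factors: permute b with the row permutation, do a
   non-transpose solve with the stored U^T, then with the stored L^T, and finally permute through the
   column permutation (via a temporary, since that copy cannot be done in place). */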
1320 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1321 {
1322   PetscInt                              n = xx->map->n;
1323   const PetscScalar                     *barray;
1324   PetscScalar                           *xarray;
1325   thrust::device_ptr<const PetscScalar> bGPU;
1326   thrust::device_ptr<PetscScalar>       xGPU;
1327   cusparseStatus_t                      stat;
1328   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1329   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1330   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1331   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1332 
1333   PetscFunctionBegin;
1334   /* Analyze the matrix and create the transpose ... on the fly */
1335   if (!loTriFactorT && !upTriFactorT) {
1336     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1337     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1338     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1339   }
1340 
1341   /* Get the GPU pointers */
1342   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1343   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1344   xGPU = thrust::device_pointer_cast(xarray);
1345   bGPU = thrust::device_pointer_cast(barray);
1346 
1347   PetscCall(PetscLogGpuTimeBegin());
1348   /* First, reorder with the row permutation */
1349   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1350                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1351                xGPU);
1352 
1353   /* Next, solve U */
1354   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1355                         upTriFactorT->csrMat->num_rows,
1356                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1357                         upTriFactorT->csrMat->num_entries,
1358                       #endif
1359                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1360                         upTriFactorT->csrMat->values->data().get(),
1361                         upTriFactorT->csrMat->row_offsets->data().get(),
1362                         upTriFactorT->csrMat->column_indices->data().get(),
1363                         upTriFactorT->solveInfo,
1364                         xarray,
1365                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1366                         tempGPU->data().get(),
1367                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1368                       #else
1369                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1370                       #endif
1371 
1372   /* Then, solve L */
1373   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1374                         loTriFactorT->csrMat->num_rows,
1375                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1376                         loTriFactorT->csrMat->num_entries,
1377                       #endif
1378                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1379                         loTriFactorT->csrMat->values->data().get(),
1380                         loTriFactorT->csrMat->row_offsets->data().get(),
1381                         loTriFactorT->csrMat->column_indices->data().get(),
1382                         loTriFactorT->solveInfo,
1383                         tempGPU->data().get(),
1384                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1385                         xarray,
1386                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1387                       #else
1388                          xarray);PetscCallCUSPARSE(stat);
1389                       #endif
1390 
1391   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1392   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1393                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1394                tempGPU->begin());
1395 
1396   /* Copy the temporary to the full solution. */
1397   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1398 
1399   /* restore */
1400   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1401   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1402   PetscCall(PetscLogGpuTimeEnd());
1403   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1404   PetscFunctionReturn(0);
1405 }
1406 
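/* Same as MatSolveTranspose_SeqAIJCUSPARSE(), but with natural ordering the row/column permutation
   steps vanish: b feeds straight into the U^T solve and the L^T solve writes straight into x. */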
1407 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1408 {
1409   const PetscScalar                 *barray;
1410   PetscScalar                       *xarray;
1411   cusparseStatus_t                  stat;
1412   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1413   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1414   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1415   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1416 
1417   PetscFunctionBegin;
1418   /* Analyze the matrix and create the transpose ... on the fly */
1419   if (!loTriFactorT && !upTriFactorT) {
1420     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1421     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1422     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1423   }
1424 
1425   /* Get the GPU pointers */
1426   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1427   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1428 
1429   PetscCall(PetscLogGpuTimeBegin());
1430   /* First, solve U */
1431   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1432                         upTriFactorT->csrMat->num_rows,
1433                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1434                         upTriFactorT->csrMat->num_entries,
1435                       #endif
1436                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1437                         upTriFactorT->csrMat->values->data().get(),
1438                         upTriFactorT->csrMat->row_offsets->data().get(),
1439                         upTriFactorT->csrMat->column_indices->data().get(),
1440                         upTriFactorT->solveInfo,
1441                         barray,
1442                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1443                         tempGPU->data().get(),
1444                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1445                       #else
1446                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1447                       #endif
1448 
1449   /* Then, solve L */
1450   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1451                         loTriFactorT->csrMat->num_rows,
1452                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1453                         loTriFactorT->csrMat->num_entries,
1454                       #endif
1455                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1456                         loTriFactorT->csrMat->values->data().get(),
1457                         loTriFactorT->csrMat->row_offsets->data().get(),
1458                         loTriFactorT->csrMat->column_indices->data().get(),
1459                         loTriFactorT->solveInfo,
1460                         tempGPU->data().get(),
1461                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1462                         xarray,
1463                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1464                       #else
1465                         xarray);PetscCallCUSPARSE(stat);
1466                       #endif
1467 
1468   /* restore */
1469   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1470   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1471   PetscCall(PetscLogGpuTimeEnd());
1472   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1473   PetscFunctionReturn(0);
1474 }
1475 
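/* Solve A x = b with A = LU factored under row/column permutations: permute b, forward-solve with L,
   back-solve with U, then permute the result into x. */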
1476 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1477 {
1478   const PetscScalar                     *barray;
1479   PetscScalar                           *xarray;
1480   thrust::device_ptr<const PetscScalar> bGPU;
1481   thrust::device_ptr<PetscScalar>       xGPU;
1482   cusparseStatus_t                      stat;
1483   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1484   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1485   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1486   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1487 
1488   PetscFunctionBegin;
1489 
1490   /* Get the GPU pointers */
1491   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1492   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1493   xGPU = thrust::device_pointer_cast(xarray);
1494   bGPU = thrust::device_pointer_cast(barray);
1495 
1496   PetscCall(PetscLogGpuTimeBegin());
1497   /* First, reorder with the row permutation */
1498   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1499                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1500                tempGPU->begin());
1501 
1502   /* Next, solve L */
1503   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1504                         loTriFactor->csrMat->num_rows,
1505                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1506                         loTriFactor->csrMat->num_entries,
1507                       #endif
1508                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1509                         loTriFactor->csrMat->values->data().get(),
1510                         loTriFactor->csrMat->row_offsets->data().get(),
1511                         loTriFactor->csrMat->column_indices->data().get(),
1512                         loTriFactor->solveInfo,
1513                         tempGPU->data().get(),
1514                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1515                          xarray,
1516                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1517                       #else
1518                          xarray);PetscCallCUSPARSE(stat);
1519                       #endif
1520 
1521   /* Then, solve U */
1522   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1523                         upTriFactor->csrMat->num_rows,
1524                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1525                         upTriFactor->csrMat->num_entries,
1526                       #endif
1527                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1528                         upTriFactor->csrMat->values->data().get(),
1529                         upTriFactor->csrMat->row_offsets->data().get(),
1530                         upTriFactor->csrMat->column_indices->data().get(),
1531                         upTriFactor->solveInfo,xarray,
1532                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1533                         tempGPU->data().get(),
1534                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1535                       #else
1536                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1537                       #endif
1538 
1539   /* Last, reorder with the column permutation */
1540   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1541                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
1542                xGPU);
1543 
1544   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1545   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1546   PetscCall(PetscLogGpuTimeEnd());
1547   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1548   PetscFunctionReturn(0);
1549 }
1550 
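/* Natural-ordering variant of MatSolve_SeqAIJCUSPARSE(): no permutations, just the L then U solves. */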
1551 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1552 {
1553   const PetscScalar                 *barray;
1554   PetscScalar                       *xarray;
1555   cusparseStatus_t                  stat;
1556   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1557   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1558   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1559   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1560 
1561   PetscFunctionBegin;
1562   /* Get the GPU pointers */
1563   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
1564   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1565 
1566   PetscCall(PetscLogGpuTimeBegin());
1567   /* First, solve L */
1568   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1569                         loTriFactor->csrMat->num_rows,
1570                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1571                         loTriFactor->csrMat->num_entries,
1572                       #endif
1573                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1574                         loTriFactor->csrMat->values->data().get(),
1575                         loTriFactor->csrMat->row_offsets->data().get(),
1576                         loTriFactor->csrMat->column_indices->data().get(),
1577                         loTriFactor->solveInfo,
1578                         barray,
1579                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1580                         tempGPU->data().get(),
1581                         loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1582                       #else
1583                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1584                       #endif
1585 
1586   /* Next, solve U */
1587   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1588                         upTriFactor->csrMat->num_rows,
1589                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1590                         upTriFactor->csrMat->num_entries,
1591                       #endif
1592                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1593                         upTriFactor->csrMat->values->data().get(),
1594                         upTriFactor->csrMat->row_offsets->data().get(),
1595                         upTriFactor->csrMat->column_indices->data().get(),
1596                         upTriFactor->solveInfo,
1597                         tempGPU->data().get(),
1598                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1599                         xarray,
1600                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1601                       #else
1602                         xarray);PetscCallCUSPARSE(stat);
1603                       #endif
1604 
1605   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
1606   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
1607   PetscCall(PetscLogGpuTimeEnd());
1608   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1609   PetscFunctionReturn(0);
1610 }
1611 
1612 #if CUSPARSE_VERSION >= 11500
1613 /* cusparseSpSV_solve() and friends first appeared in the cuSPARSE shipped with CUDA-11.3 (CUSPARSE_VERSION 11500) */
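/* The solvers below use cuSPARSE's generic SpSV API. As a sketch (see NVIDIA's documentation for the
   authoritative signatures), the lifecycle of one triangular solve is:

     cusparseSpSV_createDescr(&spsvDescr);
     cusparseSpSV_bufferSize(handle, opA, &alpha, matA, x, y, computeType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescr, &bufferSize);
     cudaMalloc(&buffer, bufferSize);
     cusparseSpSV_analysis(handle, opA, &alpha, matA, x, y, computeType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescr, buffer);
     cusparseSpSV_solve(handle, opA, &alpha, matA, x, y, computeType, CUSPARSE_SPSV_ALG_DEFAULT, spsvDescr);

   The analysis step runs once per (re)factorization, since it reads the matrix values; the solve step
   then runs once per MatSolve(). */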
1614 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
1615 {
1616   Mat_SeqAIJCUSPARSETriFactors  *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
1617   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
1618   const PetscScalar             *barray;
1619   PetscScalar                   *xarray;
1620 
1621   PetscFunctionBegin;
1622   PetscCall(VecCUDAGetArrayWrite(x,&xarray));
1623   PetscCall(VecCUDAGetArrayRead(b,&barray));
1624   PetscCall(PetscLogGpuTimeBegin());
1625 
1626   /* Solve L*y = b */
1627   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
1628   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
1629   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
1630                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1631                     &PETSC_CUSPARSE_ONE,
1632                     fs->spMatDescr_L, /* L Y = X */
1633                     fs->dnVecDescr_X,
1634                     fs->dnVecDescr_Y,
1635                     cusparse_scalartype,
1636                     CUSPARSE_SPSV_ALG_DEFAULT,
1637                     fs->spsvDescr_L));
1638 
1639   /* Solve U*x = y */
1640   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
1641   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
1642                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1643                     &PETSC_CUSPARSE_ONE,
1644                     fs->spMatDescr_U, /* U X = Y */
1645                     fs->dnVecDescr_Y,
1646                     fs->dnVecDescr_X,
1647                     cusparse_scalartype,
1648                     CUSPARSE_SPSV_ALG_DEFAULT,
1649                     fs->spsvDescr_U));
1650 
1651   PetscCall(VecCUDARestoreArrayRead(b,&barray));
1652   PetscCall(VecCUDARestoreArrayWrite(x,&xarray));
1653 
1654   PetscCall(PetscLogGpuTimeEnd());
1655   PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
1656   PetscFunctionReturn(0);
1657 }
1658 
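/* Transpose solve with the ILU(0) factors. The Lt/Ut SpSV descriptors are created lazily on the first
   call; cusparseSpSV solves with op(A) = CUSPARSE_OPERATION_TRANSPOSE on the very same L/U matrix
   descriptors used for the forward solves, so no explicit transpose is ever formed. */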
1659 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
1660 {
1661   Mat_SeqAIJCUSPARSETriFactors  *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
1662   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
1663   const PetscScalar             *barray;
1664   PetscScalar                   *xarray;
1665 
1666   PetscFunctionBegin;
1667   if (!fs->builtSolveTranspose) { /* first MatSolveTranspose() call: create the transpose-solve descriptors and run their analyses */
1668     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1669     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
1670                       CUSPARSE_OPERATION_TRANSPOSE,
1671                       &PETSC_CUSPARSE_ONE,
1672                       fs->spMatDescr_L, /* the matrix is still L; we only do transpose solves with it */
1673                       fs->dnVecDescr_X,
1674                       fs->dnVecDescr_Y,
1675                       cusparse_scalartype,
1676                       CUSPARSE_SPSV_ALG_DEFAULT,
1677                       fs->spsvDescr_Lt,
1678                       &fs->spsvBufferSize_Lt));
1679 
1680     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1681     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
1682                       CUSPARSE_OPERATION_TRANSPOSE,
1683                       &PETSC_CUSPARSE_ONE,
1684                       fs->spMatDescr_U,
1685                       fs->dnVecDescr_X,
1686                       fs->dnVecDescr_Y,
1687                       cusparse_scalartype,
1688                       CUSPARSE_SPSV_ALG_DEFAULT,
1689                       fs->spsvDescr_Ut,
1690                       &fs->spsvBufferSize_Ut));
1691 
1692     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut));
1693     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
1694 
1695     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
1696                     CUSPARSE_OPERATION_TRANSPOSE,
1697                     &PETSC_CUSPARSE_ONE,
1698                     fs->spMatDescr_L,
1699                     fs->dnVecDescr_X,
1700                     fs->dnVecDescr_Y,
1701                     cusparse_scalartype,
1702                     CUSPARSE_SPSV_ALG_DEFAULT,
1703                     fs->spsvDescr_Lt,
1704                     fs->spsvBuffer_Lt));
1705 
1706     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
1707                     CUSPARSE_OPERATION_TRANSPOSE,
1708                     &PETSC_CUSPARSE_ONE,
1709                     fs->spMatDescr_U,
1710                     fs->dnVecDescr_X,
1711                     fs->dnVecDescr_Y,
1712                     cusparse_scalartype,
1713                     CUSPARSE_SPSV_ALG_DEFAULT,
1714                     fs->spsvDescr_Ut,
1715                     fs->spsvBuffer_Ut));
1716     fs->builtSolveTranspose = PETSC_TRUE;
1717   }
1718 
1719   PetscCall(VecCUDAGetArrayWrite(x,&xarray));
1720   PetscCall(VecCUDAGetArrayRead(b,&barray));
1721   PetscCall(PetscLogGpuTimeBegin());
1722 
1723   /* Solve Ut*y = b */
1724   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
1725   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
1726   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
1727                     CUSPARSE_OPERATION_TRANSPOSE,
1728                     &PETSC_CUSPARSE_ONE,
1729                     fs->spMatDescr_U, /* Ut Y = X */
1730                     fs->dnVecDescr_X,
1731                     fs->dnVecDescr_Y,
1732                     cusparse_scalartype,
1733                     CUSPARSE_SPSV_ALG_DEFAULT,
1734                     fs->spsvDescr_Ut));
1735 
1736   /* Solve Lt*x = y */
1737   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
1738   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
1739                     CUSPARSE_OPERATION_TRANSPOSE,
1740                     &PETSC_CUSPARSE_ONE,
1741                     fs->spMatDescr_L, /* Lt X = Y */
1742                     fs->dnVecDescr_Y,
1743                     fs->dnVecDescr_X,
1744                     cusparse_scalartype,
1745                     CUSPARSE_SPSV_ALG_DEFAULT,
1746                     fs->spsvDescr_Lt));
1747 
1748   PetscCall(VecCUDARestoreArrayRead(b,&barray));
1749   PetscCall(VecCUDARestoreArrayWrite(x,&xarray));
1750   PetscCall(PetscLogGpuTimeEnd());
1751   PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
1752   PetscFunctionReturn(0);
1753 }
1754 
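/* Numeric ILU(0): copy A's current values into fact's CSR arrays (the pattern was fixed in the symbolic
   phase), factor in place with csrilu02, then redo the SpSV analyses since they read the new values. */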
1755 static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info)
1756 {
1757   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
1758   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
1759   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1760   CsrMatrix                    *Acsr;
1761   PetscInt                     m,nz;
1762   PetscBool                    flg;
1763 
1764   PetscFunctionBegin;
1765   if (PetscDefined(USE_DEBUG)) {
1766     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
1767     PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
1768   }
1769 
1770   /* Copy A's value to fact */
1771   m  = fact->rmap->n;
1772   nz = aij->nz;
1773   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1774   Acsr = (CsrMatrix*)Acusp->mat->mat;
1775   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
1776 
1777   /* Factorize fact inplace */
1778   if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1779                     fs->matDescr_M,
1780                     fs->csrVal,
1781                     fs->csrRowPtr,
1782                     fs->csrColIdx,
1783                     fs->ilu0Info_M,
1784                     fs->policy_M,
1785                     fs->factBuffer_M));
1786   if (PetscDefined(USE_DEBUG)) {
1787     int              numerical_zero;
1788     cusparseStatus_t status;
1789     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1790     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero);
1791   }
1792 
1793   /* Experiments suggest cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values; therefore we call it after cusparseXcsrilu02() */
1794   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
1795                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1796                     &PETSC_CUSPARSE_ONE,
1797                     fs->spMatDescr_L,
1798                     fs->dnVecDescr_X,
1799                     fs->dnVecDescr_Y,
1800                     cusparse_scalartype,
1801                     CUSPARSE_SPSV_ALG_DEFAULT,
1802                     fs->spsvDescr_L,
1803                     fs->spsvBuffer_L));
1804 
1805   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
1806                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1807                     &PETSC_CUSPARSE_ONE,
1808                     fs->spMatDescr_U,
1809                     fs->dnVecDescr_X,
1810                     fs->dnVecDescr_Y,
1811                     cusparse_scalartype,
1812                     CUSPARSE_SPSV_ALG_DEFAULT,
1813                     fs->spsvDescr_U,
1814                     fs->spsvBuffer_U));
1815 
1816   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1817   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
1818   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1819   fact->ops->matsolve          = NULL;
1820   fact->ops->matsolvetranspose = NULL;
1821   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1822   PetscFunctionReturn(0);
1823 }
1824 
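/* Symbolic ILU(0): since ILU(0) keeps A's sparsity pattern, there is no fill to compute. This routine
   mirrors A's CSR structure into fact, creates the M/L/U descriptors, allocates the factorization and
   solve buffers, and runs the (structural) csrilu02 analysis; the SpSV analyses are deferred to the
   numeric phase because they need valid matrix values. */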
1825 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
1826 {
1827   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
1828   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
1829   PetscInt                     m,nz;
1830 
1831   PetscFunctionBegin;
1832   if (PetscDefined(USE_DEBUG)) {
1833     PetscInt  i;
1834     PetscBool flg,missing;
1835 
1836     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
1837     PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
1838     PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
1839     PetscCall(MatMissingDiagonal(A,&missing,&i));
1840     PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
1841   }
1842 
1843   /* Free the old stale stuff */
1844   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1845 
1846   /* Copy over A's meta data to fact. This also allocates fact's i,j,a on the host;
1847      they will not be used and exist only to ease debugging.
1848    */
1849   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));
1850 
1851   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1852   fact->factortype             = MAT_FACTOR_ILU;
1853   fact->info.factor_mallocs    = 0;
1854   fact->info.fill_ratio_given  = info->fill;
1855   fact->info.fill_ratio_needed = 1.0;
1856 
1857   aij->row = NULL;
1858   aij->col = NULL;
1859 
1860   /* ====================================================================== */
1861   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1862   /* We'll do in-place factorization on fact                                */
1863   /* ====================================================================== */
1864   const int *Ai,*Aj;
1865 
1866   m  = fact->rmap->n;
1867   nz = aij->nz;
1868 
1869   PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
1870   PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
1871   PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
1872   PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
1873   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
1874   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
1875 
1876   /* ====================================================================== */
1877   /* Create descriptors for M, L, U                                         */
1878   /* ====================================================================== */
1879   cusparseFillMode_t fillMode;
1880   cusparseDiagType_t diagType;
1881 
1882   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1883   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1884   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1885 
1886   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1887     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1888     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1889     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1890     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1891   */
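  /* L and U are views of the same in-place factored CSR arrays (csrRowPtr/csrColIdx/csrVal). The
     diagonal entries stored in csrVal belong to U, so L is declared unit-diagonal, which makes
     cusparse ignore the stored diagonal when solving with L. */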
1892   fillMode = CUSPARSE_FILL_MODE_LOWER;
1893   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1894   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
1895                     fs->csrRowPtr,
1896                     fs->csrColIdx,
1897                     fs->csrVal,
1898                     CUSPARSE_INDEX_32I,
1899                     CUSPARSE_INDEX_32I,
1900                     CUSPARSE_INDEX_BASE_ZERO,
1901                     cusparse_scalartype));
1902   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
1903                     CUSPARSE_SPMAT_FILL_MODE,
1904                     &fillMode,
1905                     sizeof(fillMode)));
1906   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
1907                     CUSPARSE_SPMAT_DIAG_TYPE,
1908                     &diagType,
1909                     sizeof(diagType)));
1910 
1911   fillMode = CUSPARSE_FILL_MODE_UPPER;
1912   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1913   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz,
1914                     fs->csrRowPtr,
1915                     fs->csrColIdx,
1916                     fs->csrVal,
1917                     CUSPARSE_INDEX_32I,
1918                     CUSPARSE_INDEX_32I,
1919                     CUSPARSE_INDEX_BASE_ZERO,
1920                     cusparse_scalartype));
1921   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
1922                     CUSPARSE_SPMAT_FILL_MODE,
1923                     &fillMode,
1924                     sizeof(fillMode)));
1925   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
1926                     CUSPARSE_SPMAT_DIAG_TYPE,
1927                     &diagType,
1928                     sizeof(diagType)));
1929 
1930   /* ========================================================================= */
1931   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1932   /* ========================================================================= */
1933   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
1934   if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1935                     fs->matDescr_M,
1936                     fs->csrVal,
1937                     fs->csrRowPtr,
1938                     fs->csrColIdx,
1939                     fs->ilu0Info_M,
1940                     &fs->factBufferSize_M));
1941 
1942   PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
1943   PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));
1944 
1945   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
1946   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));
1947 
1948   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
1949   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
1950                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1951                     &PETSC_CUSPARSE_ONE,
1952                     fs->spMatDescr_L,
1953                     fs->dnVecDescr_X,
1954                     fs->dnVecDescr_Y,
1955                     cusparse_scalartype,
1956                     CUSPARSE_SPSV_ALG_DEFAULT,
1957                     fs->spsvDescr_L,
1958                     &fs->spsvBufferSize_L));
1959 
1960   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
1961   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
1962                     CUSPARSE_OPERATION_NON_TRANSPOSE,
1963                     &PETSC_CUSPARSE_ONE,
1964                     fs->spMatDescr_U,
1965                     fs->dnVecDescr_X,
1966                     fs->dnVecDescr_Y,
1967                     cusparse_scalartype,
1968                     CUSPARSE_SPSV_ALG_DEFAULT,
1969                     fs->spsvDescr_U,
1970                     &fs->spsvBufferSize_U));
1971 
1972   /* Experiments with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab
1973      show that spsvBuffer_L and spsvBuffer_U cannot be shared.
1974    */
1975   PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U));
1976   PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
1977   PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M));
1978 
1979   /* ========================================================================== */
1980   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1981   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1982   /* ========================================================================== */
1983   int              structural_zero;
1984   cusparseStatus_t status;
1985 
1986   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1987   if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1988                     fs->matDescr_M,
1989                     fs->csrVal,
1990                     fs->csrRowPtr,
1991                     fs->csrColIdx,
1992                     fs->ilu0Info_M,
1993                     fs->policy_M,
1994                     fs->factBuffer_M));
1995   if (PetscDefined(USE_DEBUG)) {
1996     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1997     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1998     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero);
1999   }
2000 
2001   /* Estimate FLOPs of the numeric factorization */
2002   Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
2003   PetscInt       *Adiag,nzRow,nzLeft;
2004   PetscLogDouble flops = 0.0;
2005 
2006   PetscCall(MatMarkDiagonal_SeqAIJ(A));
2007   Ai    = Aseq->i;
2008   Adiag = Aseq->diag;
2009   for (PetscInt i=0; i<m; i++) {
2010     if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */
2011       nzRow  = Ai[i+1] - Ai[i];
2012       /* Eliminate the nonzeros left of the diagonal one by one; each elimination updates the entries to
2013          the right of (and including) the eliminated one, at one multiplication and one addition per entry.
2014          The exact left count would be Adiag[i] - Ai[i]; we estimate half the row lies left of the diagonal.
2015       */
2016       nzLeft = (nzRow-1)/2;
2017       flops += nzLeft*(2.0*nzRow-nzLeft+1);
2018     }
2019   }
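  /* Illustrative example of the estimate above: a row with nzRow = 5 gives nzLeft = (5-1)/2 = 2 and
     contributes 2*(2*5 - 2 + 1) = 18 flops. */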
2020   fs->numericFactFlops = flops;
2021   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
2022   PetscFunctionReturn(0);
2023 }
2024 
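/* Solve with the ICC(0) factor A ~= L*L^T: first L*y = b, then L^T*x = y, the latter done by applying
   CUSPARSE_OPERATION_TRANSPOSE to the same descriptor for L (see spsvDescr_Lt, set up in the numeric
   factorization below). */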
2025 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)
2026 {
2027   Mat_SeqAIJCUSPARSETriFactors  *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
2028   Mat_SeqAIJ                    *aij = (Mat_SeqAIJ*)fact->data;
2029   const PetscScalar             *barray;
2030   PetscScalar                   *xarray;
2031 
2032   PetscFunctionBegin;
2033   PetscCall(VecCUDAGetArrayWrite(x,&xarray));
2034   PetscCall(VecCUDAGetArrayRead(b,&barray));
2035   PetscCall(PetscLogGpuTimeBegin());
2036 
2037   /* Solve L*y = b */
2038   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray));
2039   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
2040   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
2041                     CUSPARSE_OPERATION_NON_TRANSPOSE,
2042                     &PETSC_CUSPARSE_ONE,
2043                     fs->spMatDescr_L, /* L Y = X */
2044                     fs->dnVecDescr_X,
2045                     fs->dnVecDescr_Y,
2046                     cusparse_scalartype,
2047                     CUSPARSE_SPSV_ALG_DEFAULT,
2048                     fs->spsvDescr_L));
2049 
2050   /* Solve Lt*x = y */
2051   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray));
2052   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,
2053                     CUSPARSE_OPERATION_TRANSPOSE,
2054                     &PETSC_CUSPARSE_ONE,
2055                     fs->spMatDescr_L, /* Lt X = Y */
2056                     fs->dnVecDescr_Y,
2057                     fs->dnVecDescr_X,
2058                     cusparse_scalartype,
2059                     CUSPARSE_SPSV_ALG_DEFAULT,
2060                     fs->spsvDescr_Lt));
2061 
2062   PetscCall(VecCUDARestoreArrayRead(b,&barray));
2063   PetscCall(VecCUDARestoreArrayWrite(x,&xarray));
2064 
2065   PetscCall(PetscLogGpuTimeEnd());
2066   PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
2067   PetscFunctionReturn(0);
2068 }
2069 
2070 static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info)
2071 {
2072   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
2073   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
2074   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2075   CsrMatrix                    *Acsr;
2076   PetscInt                     m,nz;
2077   PetscBool                    flg;
2078 
2079   PetscFunctionBegin;
2080   if (PetscDefined(USE_DEBUG)) {
2081     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2082     PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
2083   }
2084 
2085   /* Copy A's value to fact */
2086   m  = fact->rmap->n;
2087   nz = aij->nz;
2088   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2089   Acsr = (CsrMatrix*)Acusp->mat->mat;
2090   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
2091 
2092   /* Factorize fact inplace */
2093   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
2094      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
2095      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
2096      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
2097      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
2098    */
2099   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz,
2100                     fs->matDescr_M,
2101                     fs->csrVal,
2102                     fs->csrRowPtr,
2103                     fs->csrColIdx,
2104                     fs->ic0Info_M,
2105                     fs->policy_M,
2106                     fs->factBuffer_M));
2107   if (PetscDefined(USE_DEBUG)) {
2108     int              numerical_zero;
2109     cusparseStatus_t status;
2110     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
2111     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero);
2112   }
2113 
2114   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
2115                     CUSPARSE_OPERATION_NON_TRANSPOSE,
2116                     &PETSC_CUSPARSE_ONE,
2117                     fs->spMatDescr_L,
2118                     fs->dnVecDescr_X,
2119                     fs->dnVecDescr_Y,
2120                     cusparse_scalartype,
2121                     CUSPARSE_SPSV_ALG_DEFAULT,
2122                     fs->spsvDescr_L,
2123                     fs->spsvBuffer_L));
2124 
2125   /* Note that cusparse reports the following error if we use real double precision (CUDA_R_64F) with CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE:
2126     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
2127   */
2128   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
2129                     CUSPARSE_OPERATION_TRANSPOSE,
2130                     &PETSC_CUSPARSE_ONE,
2131                     fs->spMatDescr_L,
2132                     fs->dnVecDescr_X,
2133                     fs->dnVecDescr_Y,
2134                     cusparse_scalartype,
2135                     CUSPARSE_SPSV_ALG_DEFAULT,
2136                     fs->spsvDescr_Lt,
2137                     fs->spsvBuffer_Lt));
2138 
2139   fact->offloadmask            = PETSC_OFFLOAD_GPU;
2140   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
2141   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
2142   fact->ops->matsolve          = NULL;
2143   fact->ops->matsolvetranspose = NULL;
2144   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
2145   PetscFunctionReturn(0);
2146 }
2147 
2148 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2149 {
2150   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
2151   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
2152   PetscInt                     m,nz;
2153 
2154   PetscFunctionBegin;
2155   if (PetscDefined(USE_DEBUG)) {
2156     PetscInt  i;
2157     PetscBool flg,missing;
2158 
2159     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2160     PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
2161     PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
2162     PetscCall(MatMissingDiagonal(A,&missing,&i));
2163     PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
2164   }
2165 
2166   /* Free the old stale stuff */
2167   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2168 
2169   /* Copy over A's metadata to fact. Note that we also allocate fact's i,j,a on host;
2170      they will not be used, but having them around makes debugging easier.
2171    */
2172   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));
2173 
2174   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2175   fact->factortype             = MAT_FACTOR_ICC;
2176   fact->info.factor_mallocs    = 0;
2177   fact->info.fill_ratio_given  = info->fill;
2178   fact->info.fill_ratio_needed = 1.0;
2179 
2180   aij->row = NULL;
2181   aij->col = NULL;
2182 
2183   /* ====================================================================== */
2184   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2185   /* We'll do in-place factorization on fact                                */
2186   /* ====================================================================== */
2187   const int *Ai,*Aj;
2188 
2189   m  = fact->rmap->n;
2190   nz = aij->nz;
2191 
2192   PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
2193   PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
2194   PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
2195   PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
2196   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
2197   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
2198 
2199   /* ====================================================================== */
2200   /* Create mat descriptors for M, L                                        */
2201   /* ====================================================================== */
2202   cusparseFillMode_t fillMode;
2203   cusparseDiagType_t diagType;
2204 
2205   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2206   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2207   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2208 
2209   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2210     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2211     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2212     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2213     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2214   */
2215   fillMode = CUSPARSE_FILL_MODE_LOWER;
2216   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2217   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
2218                     fs->csrRowPtr,
2219                     fs->csrColIdx,
2220                     fs->csrVal,
2221                     CUSPARSE_INDEX_32I,
2222                     CUSPARSE_INDEX_32I,
2223                     CUSPARSE_INDEX_BASE_ZERO,
2224                     cusparse_scalartype));
2225   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
2226                     CUSPARSE_SPMAT_FILL_MODE,
2227                     &fillMode,
2228                     sizeof(fillMode)));
2229   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
2230                     CUSPARSE_SPMAT_DIAG_TYPE,
2231                     &diagType,
2232                     sizeof(diagType)));
2233 
2234   /* ========================================================================= */
2235   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2236   /* ========================================================================= */
2237   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
2238   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz,
2239                     fs->matDescr_M,
2240                     fs->csrVal,
2241                     fs->csrRowPtr,
2242                     fs->csrColIdx,
2243                     fs->ic0Info_M,
2244                     &fs->factBufferSize_M));
2245 
2246   PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
2247   PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));
2248 
2249   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
2250   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));
2251 
2252   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2253   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
2254                     CUSPARSE_OPERATION_NON_TRANSPOSE,
2255                     &PETSC_CUSPARSE_ONE,
2256                     fs->spMatDescr_L,
2257                     fs->dnVecDescr_X,
2258                     fs->dnVecDescr_Y,
2259                     cusparse_scalartype,
2260                     CUSPARSE_SPSV_ALG_DEFAULT,
2261                     fs->spsvDescr_L,
2262                     &fs->spsvBufferSize_L));
2263 
2264   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2265   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
2266                     CUSPARSE_OPERATION_TRANSPOSE,
2267                     &PETSC_CUSPARSE_ONE,
2268                     fs->spMatDescr_L,
2269                     fs->dnVecDescr_X,
2270                     fs->dnVecDescr_Y,
2271                     cusparse_scalartype,
2272                     CUSPARSE_SPSV_ALG_DEFAULT,
2273                     fs->spsvDescr_Lt,
2274                     &fs->spsvBufferSize_Lt));
2275 
2276   PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M));
2277   PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
2278   PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
2279 
2280   /* ========================================================================== */
2281   /* Perform analysis of ic0 on M                                               */
2282   /* The lower triangular part of M has the same sparsity pattern as L          */
2283   /* ========================================================================== */
2284   int              structural_zero;
2285   cusparseStatus_t status;
2286 
2287   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2288   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz,
2289                     fs->matDescr_M,
2290                     fs->csrVal,
2291                     fs->csrRowPtr,
2292                     fs->csrColIdx,
2293                     fs->ic0Info_M,
2294                     fs->policy_M,
2295                     fs->factBuffer_M));
2296   if (PetscDefined(USE_DEBUG)) {
2297     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2298     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2299     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero);
2300   }
2301 
2302   /* Estimate FLOPs of the numeric factorization */
2303   Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
2304   PetscInt       nzRow,nzLeft;
2305   PetscLogDouble flops = 0.0;
2306 
2307   Ai = Aseq->i;
2308   for (PetscInt i=0; i<m; i++) {
2309     nzRow = Ai[i+1] - Ai[i];
2310     if (nzRow > 1) {
2311       /* We eliminate the nonzeros left of the diagonal one by one. Each elimination updates the entries
2312          to the right of (and including) the eliminated one, costing a multiplication and an addition per
2313          entry. Approximate nzLeft by half of the off-diagonal entries of the row. */
2314       nzLeft = (nzRow-1)/2;
2315       flops += nzLeft*(2.0*nzRow-nzLeft+1);
2316     }
2317   }
2318   fs->numericFactFlops = flops;
2319   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2320   PetscFunctionReturn(0);
2321 }
2322 #endif
2323 
2324 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2325 {
2326   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2327 
2328   PetscFunctionBegin;
2329  #if CUSPARSE_VERSION >= 11500
2330   PetscBool row_identity,col_identity;
2331   PetscCall(ISIdentity(isrow,&row_identity));
2332   PetscCall(ISIdentity(iscol,&col_identity));
2333   if (!info->levels && row_identity && col_identity) {
2334     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info));
2335   } else
2336  #endif
2337   {
2338     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2339     PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
2340     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2341   }
2342   PetscFunctionReturn(0);
2343 }
2344 
2345 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2346 {
2347   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2348 
2349   PetscFunctionBegin;
2350   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2351   PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
2352   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2353   PetscFunctionReturn(0);
2354 }
2355 
2356 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2357 {
2358   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2359 
2360   PetscFunctionBegin;
2361  #if CUSPARSE_VERSION >= 11500
2362   PetscBool perm_identity;
2363   PetscCall(ISIdentity(perm,&perm_identity));
2364   if (!info->levels && perm_identity) {
2365     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info));
2366   } else
2367  #endif
2368   {
2369     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2370     PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
2371     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2372   }
2373   PetscFunctionReturn(0);
2374 }
2375 
2376 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2377 {
2378   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2379 
2380   PetscFunctionBegin;
2381   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
2382   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
2383   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
2384   PetscFunctionReturn(0);
2385 }
2386 
2387 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
2388 {
2389   PetscFunctionBegin;
2390   *type = MATSOLVERCUSPARSE;
2391   PetscFunctionReturn(0);
2392 }
2393 
2394 /*MC
2395   MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices
2396   on a single GPU of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. The currently supported
2397   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2398   performance in the triangular solves. Full LU and Cholesky decompositions can be applied through the
2399   CUSPARSE triangular solve algorithm, but the performance can be quite poor, so these
2400   algorithms are not recommended. This class does NOT support direct solver operations.
2401 
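  Options Database Keys (typical use; assumes the matrix has already been set to type aijcusparse):
+  -pc_factor_mat_solver_type cusparse - select this solver within a PCFactor preconditioner
-  -pc_type ilu - factorization to apply (icc works analogously; lu and cholesky are supported but not recommended)
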
2402   Level: beginner
2403 
2404 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2405 M*/
2406 
2407 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
2408 {
2409   PetscInt       n = A->rmap->n;
2410 
2411   PetscFunctionBegin;
2412   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
2413   PetscCall(MatSetSizes(*B,n,n,n,n));
2414   (*B)->factortype = ftype;
2415   PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));
2416 
2417   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
2418   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2419     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
2420     if (!A->boundtocpu) {
2421       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
2422       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
2423     } else {
2424       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
2425       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
2426     }
2427     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
2428     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2429     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2430   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2431     if (!A->boundtocpu) {
2432       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
2433       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
2434     } else {
2435       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
2436       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
2437     }
2438     PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2439     PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2440   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
2441 
2442   PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
2443   (*B)->canuseordering = PETSC_TRUE;
2444   PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
2445   PetscFunctionReturn(0);
2446 }
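
/* Illustrative sketch of reaching the factory above programmatically; A is assumed to be a
   square MATSEQAIJCUSPARSE matrix and all names besides the PETSc calls are for the example only:

     Mat           F;
     IS            rowperm,colperm;
     MatFactorInfo info;

     PetscCall(MatFactorInfoInitialize(&info));
     PetscCall(MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_ILU,&F));
     PetscCall(MatGetOrdering(A,MATORDERINGNATURAL,&rowperm,&colperm));
     PetscCall(MatILUFactorSymbolic(F,A,rowperm,colperm,&info));
     PetscCall(MatLUFactorNumeric(F,A,&info));   (ILU shares the LU numeric interface)
*/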
2447 
2448 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2449 {
2450   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2451   Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2452   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
2453 
2454   PetscFunctionBegin;
2455   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
2456     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
2457     if (A->factortype == MAT_FACTOR_NONE) {
2458       CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
2459       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2460     }
2461    #if CUSPARSE_VERSION >= 13500
2462     else if (fs->csrVal) {
2463       /* We have a factorized matrix on device and are able to copy it to host */
2464       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2465     }
2466    #endif
2467     else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host");
2468     PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar)));
2469     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
2470     A->offloadmask = PETSC_OFFLOAD_BOTH;
2471   }
2472   PetscFunctionReturn(0);
2473 }
2474 
2475 static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
2476 {
2477   PetscFunctionBegin;
2478   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2479   *array = ((Mat_SeqAIJ*)A->data)->a;
2480   PetscFunctionReturn(0);
2481 }
2482 
2483 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
2484 {
2485   PetscFunctionBegin;
2486   A->offloadmask = PETSC_OFFLOAD_CPU;
2487   *array         = NULL;
2488   PetscFunctionReturn(0);
2489 }
2490 
2491 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
2492 {
2493   PetscFunctionBegin;
2494   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
2495   *array = ((Mat_SeqAIJ*)A->data)->a;
2496   PetscFunctionReturn(0);
2497 }
2498 
2499 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
2500 {
2501   PetscFunctionBegin;
2502   *array = NULL;
2503   PetscFunctionReturn(0);
2504 }
2505 
2506 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
2507 {
2508   PetscFunctionBegin;
2509   *array = ((Mat_SeqAIJ*)A->data)->a;
2510   PetscFunctionReturn(0);
2511 }
2512 
2513 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
2514 {
2515   PetscFunctionBegin;
2516   A->offloadmask = PETSC_OFFLOAD_CPU;
2517   *array         = NULL;
2518   PetscFunctionReturn(0);
2519 }
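
/* Host-side access pattern served by the three get/restore pairs above (illustrative sketch):

     PetscScalar *val;
     PetscCall(MatSeqAIJGetArray(A,&val));      (triggers MatSeqAIJCUSPARSECopyFromGPU)
     val[0] *= 2.0;                             (mutate values on the host)
     PetscCall(MatSeqAIJRestoreArray(A,&val));  (marks A as PETSC_OFFLOAD_CPU)

   The Read variants skip the offloadmask downgrade on restore, and the Write variants skip
   the device-to-host copy on get, matching the callbacks above. */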
2520 
2521 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
2522 {
2523   Mat_SeqAIJCUSPARSE           *cusp;
2524   CsrMatrix                    *matrix;
2525 
2526   PetscFunctionBegin;
2527   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2528   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2529   cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
2530   PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
2531   matrix = (CsrMatrix*)cusp->mat->mat;
2532 
2533   if (i) {
2534    #if !defined(PETSC_USE_64BIT_INDICES)
2535     *i = matrix->row_offsets->data().get();
2536    #else
2537     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE does not support 64-bit indices");
2538    #endif
2539   }
2540   if (j) {
2541    #if !defined(PETSC_USE_64BIT_INDICES)
2542     *j = matrix->column_indices->data().get();
2543    #else
2544     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE does not support 64-bit indices");
2545    #endif
2546   }
2547   if (a) *a = matrix->values->data().get();
2548   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
2549   PetscFunctionReturn(0);
2550 }
2551 
2552 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2553 {
2554   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2555   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
2556   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2557   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
2558   cusparseStatus_t             stat;
2559   PetscBool                    both = PETSC_TRUE;
2560 
2561   PetscFunctionBegin;
2562   PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
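  /* Nothing to do unless the host copy is newer (PETSC_OFFLOAD_CPU) or the device copy has never
     been built (PETSC_OFFLOAD_UNALLOCATED). When the nonzero pattern is unchanged only the values
     are re-uploaded; otherwise all device structures are rebuilt from the host CSR data below. */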
2563   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2564     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2565       CsrMatrix *matrix;
2566       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
2567 
2568       PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
2569       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
2570       matrix->values->assign(a->a, a->a+a->nz);
2571       PetscCallCUDA(WaitForCUDA());
2572       PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
2573       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
2574       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
2575     } else {
2576       PetscInt nnz;
2577       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
2578       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
2579       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
2580       delete cusparsestruct->workVector;
2581       delete cusparsestruct->rowoffsets_gpu;
2582       cusparsestruct->workVector = NULL;
2583       cusparsestruct->rowoffsets_gpu = NULL;
2584       try {
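        /* With compressed row storage only rows containing nonzeros are kept: ii then holds
           compressedrow.nrows+1 offsets and ridx maps each kept row back to its global row index;
           otherwise ii is the usual m+1 row-offset array and no mapping is needed. */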
2585         if (a->compressedrow.use) {
2586           m    = a->compressedrow.nrows;
2587           ii   = a->compressedrow.i;
2588           ridx = a->compressedrow.rindex;
2589         } else {
2590           m    = A->rmap->n;
2591           ii   = a->i;
2592           ridx = NULL;
2593         }
2594         PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
2595         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
2596         else nnz = a->nz;
2597         PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
2598 
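        /* CSR refresher (illustrative): the 3x3 matrix [1 0 2; 0 3 0; 4 0 5] is stored as
           row_offsets = {0,2,3,5}, column_indices = {0,2,1,0,2}, values = {1,2,3,4,5}, i.e.
           row i occupies entries [row_offsets[i],row_offsets[i+1]) of the last two arrays. */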
2599         /* create cusparse matrix */
2600         cusparsestruct->nrows = m;
2601         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
2602         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
2603         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
2604         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
2605 
2606         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
2607         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
2608         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
2609         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2610         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
2611         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2612         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2613 
2614         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2615         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
2616           /* set the matrix */
2617           CsrMatrix *mat= new CsrMatrix;
2618           mat->num_rows = m;
2619           mat->num_cols = A->cmap->n;
2620           mat->num_entries = nnz;
2621           mat->row_offsets = new THRUSTINTARRAY32(m+1);
2622           mat->row_offsets->assign(ii, ii + m+1);
2623 
2624           mat->column_indices = new THRUSTINTARRAY32(nnz);
2625           mat->column_indices->assign(a->j, a->j+nnz);
2626 
2627           mat->values = new THRUSTARRAY(nnz);
2628           if (a->a) mat->values->assign(a->a, a->a+nnz);
2629 
2630           /* assign the pointer */
2631           matstruct->mat = mat;
2632          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2633           if (mat->num_rows) { /* cusparse errors on empty matrices! */
2634             stat = cusparseCreateCsr(&matstruct->matDescr,
2635                                     mat->num_rows, mat->num_cols, mat->num_entries,
2636                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
2637                                     mat->values->data().get(),
2638                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2639                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
2640           }
2641          #endif
2642         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
2643          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2644           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2645          #else
2646           CsrMatrix *mat= new CsrMatrix;
2647           mat->num_rows = m;
2648           mat->num_cols = A->cmap->n;
2649           mat->num_entries = nnz;
2650           mat->row_offsets = new THRUSTINTARRAY32(m+1);
2651           mat->row_offsets->assign(ii, ii + m+1);
2652 
2653           mat->column_indices = new THRUSTINTARRAY32(nnz);
2654           mat->column_indices->assign(a->j, a->j+nnz);
2655 
2656           mat->values = new THRUSTARRAY(nnz);
2657           if (a->a) mat->values->assign(a->a, a->a+nnz);
2658 
2659           cusparseHybMat_t hybMat;
2660           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
2661           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
2662             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
2663           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
2664               matstruct->descr, mat->values->data().get(),
2665               mat->row_offsets->data().get(),
2666               mat->column_indices->data().get(),
2667               hybMat, 0, partition);PetscCallCUSPARSE(stat);
2668           /* assign the pointer */
2669           matstruct->mat = hybMat;
2670 
2671           if (mat) {
2672             if (mat->values) delete (THRUSTARRAY*)mat->values;
2673             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
2674             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
2675             delete (CsrMatrix*)mat;
2676           }
2677          #endif
2678         }
2679 
2680         /* assign the compressed row indices */
2681         if (a->compressedrow.use) {
2682           cusparsestruct->workVector = new THRUSTARRAY(m);
2683           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2684           matstruct->cprowIndices->assign(ridx,ridx+m);
2685           tmp = m;
2686         } else {
2687           cusparsestruct->workVector = NULL;
2688           matstruct->cprowIndices    = NULL;
2689           tmp = 0;
2690         }
2691         PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));
2692 
2693         /* assign the pointer */
2694         cusparsestruct->mat = matstruct;
2695       } catch(char *ex) {
2696         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2697       }
2698       PetscCallCUDA(WaitForCUDA());
2699       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
2700       cusparsestruct->nonzerostate = A->nonzerostate;
2701     }
2702     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
2703   }
2704   PetscFunctionReturn(0);
2705 }
2706 
2707 struct VecCUDAPlusEquals
2708 {
2709   template <typename Tuple>
2710   __host__ __device__
2711   void operator()(Tuple t)
2712   {
2713     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2714   }
2715 };
2716 
2717 struct VecCUDAEquals
2718 {
2719   template <typename Tuple>
2720   __host__ __device__
2721   void operator()(Tuple t)
2722   {
2723     thrust::get<1>(t) = thrust::get<0>(t);
2724   }
2725 };
2726 
2727 struct VecCUDAEqualsReverse
2728 {
2729   template <typename Tuple>
2730   __host__ __device__
2731   void operator()(Tuple t)
2732   {
2733     thrust::get<0>(t) = thrust::get<1>(t);
2734   }
2735 };
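
/* The three functors above are applied elementwise through thrust zip iterators. A minimal
   sketch (assuming device pointers xraw,yraw of length n) computing y[i] += x[i] with
   VecCUDAPlusEquals:

     thrust::device_ptr<PetscScalar> xptr = thrust::device_pointer_cast(xraw);
     thrust::device_ptr<PetscScalar> yptr = thrust::device_pointer_cast(yraw);
     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(xptr,yptr)),
                      thrust::make_zip_iterator(thrust::make_tuple(xptr+n,yptr+n)),
                      VecCUDAPlusEquals());
*/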
2736 
2737 struct MatMatCusparse {
2738   PetscBool             cisdense;
2739   PetscScalar           *Bt;
2740   Mat                   X;
2741   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2742   PetscBool             reusesym; /* cuSPARSE does not have separate symbolic and numeric phases for sparse mat-mat operations */
2743   CsrMatrix             *Bcsr;
2744 
2745 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2746   cusparseSpMatDescr_t  matSpBDescr;
2747   PetscBool             initialized;   /* whether the descriptors below for C = alpha op(A) op(B) + beta C have been initialized */
2748   cusparseDnMatDescr_t  matBDescr;
2749   cusparseDnMatDescr_t  matCDescr;
2750   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes */
2751  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2752   void                  *dBuffer4;
2753   void                  *dBuffer5;
2754  #endif
2755   size_t                mmBufferSize;
2756   void                  *mmBuffer;
2757   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2758   cusparseSpGEMMDescr_t spgemmDesc;
2759 #endif
2760 };
2761 
2762 static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2763 {
2764   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2765 
2766   PetscFunctionBegin;
2767   PetscCallCUDA(cudaFree(mmdata->Bt));
2768   delete mmdata->Bcsr;
2769  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2770   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
2771   if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
2772   if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
2773   if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2774  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2775   if (mmdata->dBuffer4)  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
2776   if (mmdata->dBuffer5)  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2777  #endif
2778   if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2779   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2780  #endif
2781   PetscCall(MatDestroy(&mmdata->X));
2782   PetscCall(PetscFree(data));
2783   PetscFunctionReturn(0);
2784 }
2785 
2786 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2787 
2788 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2789 {
2790   Mat_Product                  *product = C->product;
2791   Mat                          A,B;
2792   PetscInt                     m,n,blda,clda;
2793   PetscBool                    flg,biscuda;
2794   Mat_SeqAIJCUSPARSE           *cusp;
2795   cusparseStatus_t             stat;
2796   cusparseOperation_t          opA;
2797   const PetscScalar            *barray;
2798   PetscScalar                  *carray;
2799   MatMatCusparse               *mmdata;
2800   Mat_SeqAIJCUSPARSEMultStruct *mat;
2801   CsrMatrix                    *csrmat;
2802 
2803   PetscFunctionBegin;
2804   MatCheckProduct(C,1);
2805   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2806   mmdata = (MatMatCusparse*)product->data;
2807   A    = product->A;
2808   B    = product->B;
2809   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2810   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2811   /* Currently CopyToGpu does not copy if the matrix is bound to the CPU.
2812      Instead of silently accepting a wrong answer, we prefer to raise an error. */
2813   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2814   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2815   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2816   switch (product->type) {
2817   case MATPRODUCT_AB:
2818   case MATPRODUCT_PtAP:
2819     mat = cusp->mat;
2820     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2821     m   = A->rmap->n;
2822     n   = B->cmap->n;
2823     break;
2824   case MATPRODUCT_AtB:
2825     if (!A->form_explicit_transpose) {
2826       mat = cusp->mat;
2827       opA = CUSPARSE_OPERATION_TRANSPOSE;
2828     } else {
2829       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2830       mat  = cusp->matTranspose;
2831       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2832     }
2833     m = A->cmap->n;
2834     n = B->cmap->n;
2835     break;
2836   case MATPRODUCT_ABt:
2837   case MATPRODUCT_RARt:
2838     mat = cusp->mat;
2839     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2840     m   = A->rmap->n;
2841     n   = B->rmap->n;
2842     break;
2843   default:
2844     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2845   }
2846   PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2847   csrmat = (CsrMatrix*)mat->mat;
2848   /* if the user passed a CPU matrix, copy the data to the GPU */
2849   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
2850   if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
2851   PetscCall(MatDenseCUDAGetArrayRead(B,&barray));
2852 
2853   PetscCall(MatDenseGetLDA(B,&blda));
2854   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2855     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
2856     PetscCall(MatDenseGetLDA(mmdata->X,&clda));
2857   } else {
2858     PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
2859     PetscCall(MatDenseGetLDA(C,&clda));
2860   }
2861 
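  /* For MATPRODUCT_RARt the SpMM below only computes the intermediate X = A*Bt, and for
     MATPRODUCT_PtAP only X = A*B; the final dense products B*X and Bt*X, respectively, are
     applied afterwards with MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(). */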
2862   PetscCall(PetscLogGpuTimeBegin());
2863  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2864   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2865   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2866   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2867     size_t mmBufferSize;
2868     if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
2869     if (!mmdata->matBDescr) {
2870       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2871       mmdata->Blda = blda;
2872     }
2873 
2874     if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
2875     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2876       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2877       mmdata->Clda = clda;
2878     }
2879 
2880     if (!mat->matDescr) {
2881       stat = cusparseCreateCsr(&mat->matDescr,
2882                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2883                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2884                                csrmat->values->data().get(),
2885                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2886                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
2887     }
2888     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2889                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2890                                    mmdata->matCDescr,cusparse_scalartype,
2891                                    cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
2892     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2893       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
2894       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
2895       mmdata->mmBufferSize = mmBufferSize;
2896     }
2897     mmdata->initialized = PETSC_TRUE;
2898   } else {
2899     /* to be safe, always update pointers of the mats */
2900     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
2901     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
2902     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
2903   }
2904 
2905   /* do cusparseSpMM, which supports transpose on B */
2906   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2907                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2908                       mmdata->matCDescr,cusparse_scalartype,
2909                       cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2910  #else
2911   PetscInt k;
2912   /* cusparseXcsrmm does not support transpose on B */
2913   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2914     cublasHandle_t cublasv2handle;
2915     cublasStatus_t cerr;
2916 
2917     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2918     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2919                        B->cmap->n,B->rmap->n,
2920                        &PETSC_CUSPARSE_ONE ,barray,blda,
2921                        &PETSC_CUSPARSE_ZERO,barray,blda,
2922                        mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
2923     blda = B->cmap->n;
2924     k    = B->cmap->n;
2925   } else {
2926     k    = B->rmap->n;
2927   }
2928 
2929   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2930   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2931                            csrmat->num_entries,mat->alpha_one,mat->descr,
2932                            csrmat->values->data().get(),
2933                            csrmat->row_offsets->data().get(),
2934                            csrmat->column_indices->data().get(),
2935                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2936                            carray,clda);PetscCallCUSPARSE(stat);
2937  #endif
2938   PetscCall(PetscLogGpuTimeEnd());
2939   PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
2940   PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
2941   if (product->type == MATPRODUCT_RARt) {
2942     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
2943     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
2944   } else if (product->type == MATPRODUCT_PtAP) {
2945     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
2946     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
2947   } else {
2948     PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
2949   }
2950   if (mmdata->cisdense) {
2951     PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
2952   }
2953   if (!biscuda) {
2954     PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
2955   }
2956   PetscFunctionReturn(0);
2957 }
2958 
2959 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2960 {
2961   Mat_Product        *product = C->product;
2962   Mat                A,B;
2963   PetscInt           m,n;
2964   PetscBool          cisdense,flg;
2965   MatMatCusparse     *mmdata;
2966   Mat_SeqAIJCUSPARSE *cusp;
2967 
2968   PetscFunctionBegin;
2969   MatCheckProduct(C,1);
2970   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2971   A    = product->A;
2972   B    = product->B;
2973   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2974   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2975   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2976   PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2977   switch (product->type) {
2978   case MATPRODUCT_AB:
2979     m = A->rmap->n;
2980     n = B->cmap->n;
2981     break;
2982   case MATPRODUCT_AtB:
2983     m = A->cmap->n;
2984     n = B->cmap->n;
2985     break;
2986   case MATPRODUCT_ABt:
2987     m = A->rmap->n;
2988     n = B->rmap->n;
2989     break;
2990   case MATPRODUCT_PtAP:
2991     m = B->cmap->n;
2992     n = B->cmap->n;
2993     break;
2994   case MATPRODUCT_RARt:
2995     m = B->rmap->n;
2996     n = B->rmap->n;
2997     break;
2998   default:
2999     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3000   }
3001   PetscCall(MatSetSizes(C,m,n,m,n));
3002   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy the result back to the CPU */
3003   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
3004   PetscCall(MatSetType(C,MATSEQDENSECUDA));
3005 
3006   /* product data */
3007   PetscCall(PetscNew(&mmdata));
3008   mmdata->cisdense = cisdense;
3009  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
3010   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
3011   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
3012     PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
3013   }
3014  #endif
3015   /* for these products we need intermediate storage */
3016   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
3017     PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
3018     PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
3019     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
3020       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
3021     } else {
3022       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
3023     }
3024   }
3025   C->product->data    = mmdata;
3026   C->product->destroy = MatDestroy_MatMatCusparse;
3027 
3028   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
3029   PetscFunctionReturn(0);
3030 }
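
/* Illustrative sketch of the user-facing MatProduct flow that dispatches to the two callbacks
   above (C = A*B with A of type MATSEQAIJCUSPARSE and B of type MATSEQDENSECUDA):

     Mat C;
     PetscCall(MatProductCreate(A,B,NULL,&C));
     PetscCall(MatProductSetType(C,MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C));
     PetscCall(MatProductSymbolic(C));
     PetscCall(MatProductNumeric(C));
*/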
3031 
3032 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3033 {
3034   Mat_Product                  *product = C->product;
3035   Mat                          A,B;
3036   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
3037   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
3038   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
3039   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3040   PetscBool                    flg;
3041   cusparseStatus_t             stat;
3042   MatProductType               ptype;
3043   MatMatCusparse               *mmdata;
3044 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3045   cusparseSpMatDescr_t         BmatSpDescr;
3046 #endif
3047   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3048 
3049   PetscFunctionBegin;
3050   MatCheckProduct(C,1);
3051   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
3052   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
3053   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
3054   mmdata = (MatMatCusparse*)C->product->data;
3055   A = product->A;
3056   B = product->B;
3057   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have already been computed in the MatProductSymbolic phase */
3058     mmdata->reusesym = PETSC_FALSE;
3059     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
3060     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3061     Cmat = Ccusp->mat;
3062     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
3063     Ccsr = (CsrMatrix*)Cmat->mat;
3064     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
3065     goto finalize;
3066   }
3067   if (!c->nz) goto finalize;
3068   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
3069   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
3070   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
3071   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
3072   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
3073   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
3074   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3075   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
3076   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
3077   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3078   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3079   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3080   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3081   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3082 
3083   ptype = product->type;
3084   if (A->symmetric && ptype == MATPRODUCT_AtB) {
3085     ptype = MATPRODUCT_AB;
3086     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
3087   }
3088   if (B->symmetric && ptype == MATPRODUCT_ABt) {
3089     ptype = MATPRODUCT_AB;
3090     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
3091   }
3092   switch (ptype) {
3093   case MATPRODUCT_AB:
3094     Amat = Acusp->mat;
3095     Bmat = Bcusp->mat;
3096     break;
3097   case MATPRODUCT_AtB:
3098     Amat = Acusp->matTranspose;
3099     Bmat = Bcusp->mat;
3100     break;
3101   case MATPRODUCT_ABt:
3102     Amat = Acusp->mat;
3103     Bmat = Bcusp->matTranspose;
3104     break;
3105   default:
3106     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3107   }
3108   Cmat = Ccusp->mat;
3109   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
3110   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
3111   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
3112   Acsr = (CsrMatrix*)Amat->mat;
3113   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
3114   Ccsr = (CsrMatrix*)Cmat->mat;
3115   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
3116   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
3117   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
3118   PetscCall(PetscLogGpuTimeBegin());
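  /* With CUDA >= 11.4 the sparsity analysis from the symbolic phase is reused via
     cusparseSpGEMMreuse_compute(); older CUDA 11.x recomputes with cusparseSpGEMM_compute()
     and then gathers the result into C's arrays with cusparseSpGEMM_copy(). */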
3119 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3120   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
3121   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3122   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3123     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
3124                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3125                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3126                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3127   #else
3128     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3129                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3130                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3131                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
3132     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
3133                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3134                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3135   #endif
3136 #else
3137   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
3138                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3139                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3140                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3141                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
3142 #endif
3143   PetscCall(PetscLogGpuFlops(mmdata->flops));
3144   PetscCallCUDA(WaitForCUDA());
3145   PetscCall(PetscLogGpuTimeEnd());
3146   C->offloadmask = PETSC_OFFLOAD_GPU;
3147 finalize:
3148   /* shorter version of MatAssemblyEnd_SeqAIJ */
3149   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
3150   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
3151   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
3152   c->reallocs         = 0;
3153   C->info.mallocs    += 0;
3154   C->info.nz_unneeded = 0;
3155   C->assembled = C->was_assembled = PETSC_TRUE;
3156   C->num_ass++;
3157   PetscFunctionReturn(0);
3158 }
3159 
3160 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3161 {
3162   Mat_Product                  *product = C->product;
3163   Mat                          A,B;
3164   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
3165   Mat_SeqAIJ                   *a,*b,*c;
3166   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
3167   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3168   PetscInt                     i,j,m,n,k;
3169   PetscBool                    flg;
3170   cusparseStatus_t             stat;
3171   MatProductType               ptype;
3172   MatMatCusparse               *mmdata;
3173   PetscLogDouble               flops;
3174   PetscBool                    biscompressed,ciscompressed;
3175 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3176   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
3177   cusparseSpMatDescr_t         BmatSpDescr;
3178 #else
3179   int                          cnz;
3180 #endif
3181   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3182 
3183   PetscFunctionBegin;
3184   MatCheckProduct(C,1);
3185   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
3186   A    = product->A;
3187   B    = product->B;
3188   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
3189   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
3190   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
3191   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
3192   a = (Mat_SeqAIJ*)A->data;
3193   b = (Mat_SeqAIJ*)B->data;
3194   /* product data */
3195   PetscCall(PetscNew(&mmdata));
3196   C->product->data    = mmdata;
3197   C->product->destroy = MatDestroy_MatMatCusparse;
3198 
3199   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3200   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3201   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3202   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
3203   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3204   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3205 
3206   ptype = product->type;
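  /* exploit symmetry: if A = A^T then A^T B = A B, and if B = B^T then A B^T = A B, so we can fall back to the plain AB product */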
3207   if (A->symmetric && ptype == MATPRODUCT_AtB) {
3208     ptype = MATPRODUCT_AB;
3209     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3210   }
3211   if (B->symmetric && ptype == MATPRODUCT_ABt) {
3212     ptype = MATPRODUCT_AB;
3213     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3214   }
3215   biscompressed = PETSC_FALSE;
3216   ciscompressed = PETSC_FALSE;
3217   switch (ptype) {
3218   case MATPRODUCT_AB:
3219     m = A->rmap->n;
3220     n = B->cmap->n;
3221     k = A->cmap->n;
3222     Amat = Acusp->mat;
3223     Bmat = Bcusp->mat;
3224     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3225     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3226     break;
3227   case MATPRODUCT_AtB:
3228     m = A->cmap->n;
3229     n = B->cmap->n;
3230     k = A->rmap->n;
3231     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3232     Amat = Acusp->matTranspose;
3233     Bmat = Bcusp->mat;
3234     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3235     break;
3236   case MATPRODUCT_ABt:
3237     m = A->rmap->n;
3238     n = B->rmap->n;
3239     k = A->cmap->n;
3240     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3241     Amat = Acusp->mat;
3242     Bmat = Bcusp->matTranspose;
3243     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3244     break;
3245   default:
3246     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3247   }
3248 
3249   /* create cusparse matrix */
3250   PetscCall(MatSetSizes(C,m,n,m,n));
3251   PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
3252   c     = (Mat_SeqAIJ*)C->data;
3253   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
3254   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3255   Ccsr  = new CsrMatrix;
3256 
3257   c->compressedrow.use = ciscompressed;
3258   if (c->compressedrow.use) { /* if a is in compressed-row format, then c will be in compressed-row format too */
3259     c->compressedrow.nrows = a->compressedrow.nrows;
3260     PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
3261     PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
3262     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3263     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3264     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
3265   } else {
3266     c->compressedrow.nrows  = 0;
3267     c->compressedrow.i      = NULL;
3268     c->compressedrow.rindex = NULL;
3269     Ccusp->workVector       = NULL;
3270     Cmat->cprowIndices      = NULL;
3271   }
3272   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
3273   Ccusp->mat      = Cmat;
3274   Ccusp->mat->mat = Ccsr;
3275   Ccsr->num_rows    = Ccusp->nrows;
3276   Ccsr->num_cols    = n;
3277   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
3278   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
3279   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
3280   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3281   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
3282   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
3283   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
3284   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
3285   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
3286   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
3287   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cuSPARSE raises errors in various calls when matrices have zero rows/columns! */
3288     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
3289     c->nz = 0;
3290     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3291     Ccsr->values = new THRUSTARRAY(c->nz);
3292     goto finalizesym;
3293   }
3294 
3295   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
3296   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
3297   Acsr = (CsrMatrix*)Amat->mat;
3298   if (!biscompressed) {
3299     Bcsr = (CsrMatrix*)Bmat->mat;
3300 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3301     BmatSpDescr = Bmat->matDescr;
3302 #endif
3303   } else { /* B uses compressed rows: build a shallow CSR view of B that shares column indices and values but uses the row offsets of the full matrix */
3304     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
3305     Bcsr = new CsrMatrix;
3306     Bcsr->num_rows       = B->rmap->n;
3307     Bcsr->num_cols       = cBcsr->num_cols;
3308     Bcsr->num_entries    = cBcsr->num_entries;
3309     Bcsr->column_indices = cBcsr->column_indices;
3310     Bcsr->values         = cBcsr->values;
3311     if (!Bcusp->rowoffsets_gpu) {
3312       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3313       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3314       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
3315     }
3316     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3317     mmdata->Bcsr = Bcsr;
3318 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3319     if (Bcsr->num_rows && Bcsr->num_cols) {
3320       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
3321                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3322                                Bcsr->values->data().get(),
3323                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
3324                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
3325     }
3326     BmatSpDescr = mmdata->matSpBDescr;
3327 #endif
3328   }
3329   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
3330   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
3331   /* precompute the flop count: for AB, each nonzero A(i,j) multiplies row j of B (2 flops per entry); for AtB, row i of A pairs with row i of B */
3332   if (ptype == MATPRODUCT_AB) {
3333     for (i=0, flops = 0; i<A->rmap->n; i++) {
3334       const PetscInt st = a->i[i];
3335       const PetscInt en = a->i[i+1];
3336       for (j=st; j<en; j++) {
3337         const PetscInt brow = a->j[j];
3338         flops += 2.*(b->i[brow+1] - b->i[brow]);
3339       }
3340     }
3341   } else if (ptype == MATPRODUCT_AtB) {
3342     for (i=0, flops = 0; i<A->rmap->n; i++) {
3343       const PetscInt anzi = a->i[i+1] - a->i[i];
3344       const PetscInt bnzi = b->i[i+1] - b->i[i];
3345       flops += (2.*anzi)*bnzi;
3346     }
3347   } else { /* TODO */
3348     flops = 0.;
3349   }
3350 
3351   mmdata->flops = flops;
3352   PetscCall(PetscLogGpuTimeBegin());
3353 
3354 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3355   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3356   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
3357                           NULL, NULL, NULL,
3358                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
3359                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
3360   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3361  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3362  {
3363   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3364      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3365   */
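  /* The reuse workflow below has five phases: workEstimation -> nnz -> allocate C and set its pointers -> copy -> compute.
     Each buffer-querying call is issued twice: first with a NULL buffer to obtain the required size, then with the allocated
     buffer to do the actual work. dBuffer4 and dBuffer5 must outlive this routine since compute is re-run in the numeric phase. */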
3366   void*  dBuffer1 = NULL;
3367   void*  dBuffer2 = NULL;
3368   void*  dBuffer3 = NULL;
3369   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3370   size_t bufferSize1 = 0;
3371   size_t bufferSize2 = 0;
3372   size_t bufferSize3 = 0;
3373   size_t bufferSize4 = 0;
3374   size_t bufferSize5 = 0;
3375 
3376   /*----------------------------------------------------------------------*/
3377   /* query how many bytes (bufferSize1) of external memory are needed */
3378   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3379                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3380                                             &bufferSize1, NULL);PetscCallCUSPARSE(stat);
3381   PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
3382   /* inspect the matrices A and B to understand the memory requirement for the next step */
3383   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3384                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3385                                             &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);
3386 
3387   /*----------------------------------------------------------------------*/
3388   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3389                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3390                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
3391   PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
3392   PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
3393   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
3394   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3395                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3396                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
3397   PetscCallCUDA(cudaFree(dBuffer1));
3398   PetscCallCUDA(cudaFree(dBuffer2));
3399 
3400   /*----------------------------------------------------------------------*/
3401   /* get matrix C non-zero entries C_nnz1 */
3402   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3403   c->nz = (PetscInt) C_nnz1;
3404   /* allocate matrix C */
3405   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3406   Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3407   /* update Cmat->matDescr with the new pointers */
3408   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
3409                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3410 
3411   /*----------------------------------------------------------------------*/
3412   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3413                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3414                                   &bufferSize5, NULL);PetscCallCUSPARSE(stat);
3415   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
3416   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3417                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
3418                                   &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
3419   PetscCallCUDA(cudaFree(dBuffer3));
3420   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
3421                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3422                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3423                                      mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3424   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
3425  }
3426  #else
3427   size_t bufSize2;
3428   /* query how many bytes (bufSize2) of external memory are needed */
3429   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3430                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3431                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3432                                        mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
3433   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
3434   /* inspect the matrices A and B to understand the memory requirement for the next step */
3435   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3436                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3437                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3438                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
3439   /* query the buffer size needed by the compute phase */
3440   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3441                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3442                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3443                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
3444   /* Neither the CUSPARSE documentation nor the API is clear here:
3445      we need both buffers to perform the operations properly.
3446      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
3447      it only appears in workEstimation, yet it seems to be needed in compute, so presumably
3448      its address is stored in the descriptor! What a messy API... */
3449   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
3450   /* compute the intermediate product of A * B */
3451   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3452                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3453                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
3454                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
3455   /* get matrix C non-zero entries C_nnz1 */
3456   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3457   c->nz = (PetscInt) C_nnz1;
3458   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
3459   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3460   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3461   Ccsr->values = new THRUSTARRAY(c->nz);
3462   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3463   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
3464                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3465   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
3466                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3467                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3468  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3469 #else
3470   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3471   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
3472                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3473                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3474                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3475                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
3476   c->nz = cnz;
3477   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3478   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3479   Ccsr->values = new THRUSTARRAY(c->nz);
3480   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3481 
3482   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3483   /* with the old gemm interface (removed in CUDA 11.0) we cannot compute only the symbolic factorization.
3484      I have tried the gemm2 interface (alpha * A * B + beta * D), which allows one to do the symbolic phase by passing NULL for the values,
3485      but it seems quite buggy when D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
3486   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
3487                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3488                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3489                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3490                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
3491 #endif
3492   PetscCall(PetscLogGpuFlops(mmdata->flops));
3493   PetscCall(PetscLogGpuTimeEnd());
3494 finalizesym:
3495   c->singlemalloc = PETSC_FALSE;
3496   c->free_a       = PETSC_TRUE;
3497   c->free_ij      = PETSC_TRUE;
3498   PetscCall(PetscMalloc1(m+1,&c->i));
3499   PetscCall(PetscMalloc1(c->nz,&c->j));
3500   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3501     PetscInt *d_i = c->i;
3502     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3503     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3504     ii   = *Ccsr->row_offsets;
3505     jj   = *Ccsr->column_indices;
3506     if (ciscompressed) d_i = c->compressedrow.i;
3507     PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3508     PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3509   } else {
3510     PetscInt *d_i = c->i;
3511     if (ciscompressed) d_i = c->compressedrow.i;
3512     PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3513     PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3514   }
3515   if (ciscompressed) { /* need to expand host row offsets: rows absent from the compressed structure are empty and repeat the previous offset */
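    /* e.g., m = 4 with nonzero rows rindex = {1,3} and compressed offsets {0,2,5} expands to c->i = {0,0,2,2,5} */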
3516     PetscInt r = 0;
3517     c->i[0] = 0;
3518     for (k = 0; k < c->compressedrow.nrows; k++) {
3519       const PetscInt next = c->compressedrow.rindex[k];
3520       const PetscInt old = c->compressedrow.i[k];
3521       for (; r < next; r++) c->i[r+1] = old;
3522     }
3523     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
3524   }
3525   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
3526   PetscCall(PetscMalloc1(m,&c->ilen));
3527   PetscCall(PetscMalloc1(m,&c->imax));
3528   c->maxnz = c->nz;
3529   c->nonzerorowcnt = 0;
3530   c->rmax = 0;
3531   for (k = 0; k < m; k++) {
3532     const PetscInt nn = c->i[k+1] - c->i[k];
3533     c->ilen[k] = c->imax[k] = nn;
3534     c->nonzerorowcnt += (PetscInt)!!nn;
3535     c->rmax = PetscMax(c->rmax,nn);
3536   }
3537   PetscCall(MatMarkDiagonal_SeqAIJ(C));
3538   PetscCall(PetscMalloc1(c->nz,&c->a));
3539   Ccsr->num_entries = c->nz;
3540 
3541   C->nonzerostate++;
3542   PetscCall(PetscLayoutSetUp(C->rmap));
3543   PetscCall(PetscLayoutSetUp(C->cmap));
3544   Ccusp->nonzerostate = C->nonzerostate;
3545   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
3546   C->preallocated  = PETSC_TRUE;
3547   C->assembled     = PETSC_FALSE;
3548   C->was_assembled = PETSC_FALSE;
3549   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3550     mmdata->reusesym = PETSC_TRUE;
3551     C->offloadmask   = PETSC_OFFLOAD_GPU;
3552   }
3553   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3554   PetscFunctionReturn(0);
3555 }
3556 
3557 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3558 
3559 /* handles sparse or dense B; dispatches to GPU or CPU product kernels based on the matrix types, bind-to-CPU flags, and user options */
3560 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3561 {
3562   Mat_Product    *product = mat->product;
3563   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
3564 
3565   PetscFunctionBegin;
3566   MatCheckProduct(mat,1);
3567   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
3568   if (!product->A->boundtocpu && !product->B->boundtocpu) {
3569     PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
3570   }
3571   if (product->type == MATPRODUCT_ABC) {
3572     Ciscusp = PETSC_FALSE;
3573     if (!product->C->boundtocpu) {
3574       PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
3575     }
3576   }
3577   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
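    /* e.g., "-matmatmult_backend_cpu" (MatMatMult API) or "-mat_product_algorithm_backend_cpu" (MatProduct API)
       forces the corresponding product to be computed with the CPU kernels */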
3578     PetscBool usecpu = PETSC_FALSE;
3579     switch (product->type) {
3580     case MATPRODUCT_AB:
3581       if (product->api_user) {
3582         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
3583         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
3584         PetscOptionsEnd();
3585       } else {
3586         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
3587         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
3588         PetscOptionsEnd();
3589       }
3590       break;
3591     case MATPRODUCT_AtB:
3592       if (product->api_user) {
3593         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
3594         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
3595         PetscOptionsEnd();
3596       } else {
3597         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
3598         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
3599         PetscOptionsEnd();
3600       }
3601       break;
3602     case MATPRODUCT_PtAP:
3603       if (product->api_user) {
3604         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
3605         PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
3606         PetscOptionsEnd();
3607       } else {
3608         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
3609         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
3610         PetscOptionsEnd();
3611       }
3612       break;
3613     case MATPRODUCT_RARt:
3614       if (product->api_user) {
3615         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
3616         PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
3617         PetscOptionsEnd();
3618       } else {
3619         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
3620         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
3621         PetscOptionsEnd();
3622       }
3623       break;
3624     case MATPRODUCT_ABC:
3625       if (product->api_user) {
3626         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
3627         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
3628         PetscOptionsEnd();
3629       } else {
3630         PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
3631         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
3632         PetscOptionsEnd();
3633       }
3634       break;
3635     default:
3636       break;
3637     }
3638     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
3639   }
3640   /* dispatch */
3641   if (isdense) {
3642     switch (product->type) {
3643     case MATPRODUCT_AB:
3644     case MATPRODUCT_AtB:
3645     case MATPRODUCT_ABt:
3646     case MATPRODUCT_PtAP:
3647     case MATPRODUCT_RARt:
3648       if (product->A->boundtocpu) {
3649         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3650       } else {
3651         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3652       }
3653       break;
3654     case MATPRODUCT_ABC:
3655       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3656       break;
3657     default:
3658       break;
3659     }
3660   } else if (Biscusp && Ciscusp) {
3661     switch (product->type) {
3662     case MATPRODUCT_AB:
3663     case MATPRODUCT_AtB:
3664     case MATPRODUCT_ABt:
3665       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3666       break;
3667     case MATPRODUCT_PtAP:
3668     case MATPRODUCT_RARt:
3669     case MATPRODUCT_ABC:
3670       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3671       break;
3672     default:
3673       break;
3674     }
3675   } else { /* fallback for AIJ */
3676     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3677   }
3678   PetscFunctionReturn(0);
3679 }
3680 
3681 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
3682 {
3683   PetscFunctionBegin;
3684   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
3685   PetscFunctionReturn(0);
3686 }
3687 
3688 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
3689 {
3690   PetscFunctionBegin;
3691   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
3692   PetscFunctionReturn(0);
3693 }
3694 
3695 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
3696 {
3697   PetscFunctionBegin;
3698   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
3699   PetscFunctionReturn(0);
3700 }
3701 
3702 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3703 {
3704   PetscFunctionBegin;
3705   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
3706   PetscFunctionReturn(0);
3707 }
3708 
3709 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
3710 {
3711   PetscFunctionBegin;
3712   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
3713   PetscFunctionReturn(0);
3714 }
3715 
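/* CUDA kernel computing y[idx[i]] += x[i] for 0 <= i < n: scatter-adds the compressed-row SpMV result held in the
   work vector back into the full-length output vector */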
3716 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
3717 {
3718   int i = blockIdx.x*blockDim.x + threadIdx.x;
3719   if (i < n) y[idx[i]] += x[i];
3720 }
3721 
3722 /* z = op(A) x + y. If trans && !herm, op = ^T; if trans && herm, op = ^H; if !trans, op is the identity */
3723 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
3724 {
3725   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
3726   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
3727   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3728   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
3729   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3730   PetscBool                    compressed;
3731 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3732   PetscInt                     nx,ny;
3733 #endif
3734 
3735   PetscFunctionBegin;
3736   PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian without transpose is not supported");
3737   if (!a->nz) {
3738     if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
3739     else PetscCall(VecCopy_SeqCUDA(yy,zz));
3740     PetscFunctionReturn(0);
3741   }
3742   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
3743   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3744   if (!trans) {
3745     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
3746     PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3747   } else {
3748     if (herm || !A->form_explicit_transpose) {
3749       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3750       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
3751     } else {
3752       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3753       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
3754     }
3755   }
3756   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3757   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3758 
3759   try {
3760     PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
3761     if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
3762     else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */
3763 
3764     PetscCall(PetscLogGpuTimeBegin());
3765     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3766       /* z = A x + beta y.
3767          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3768          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3769       */
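      /* e.g., for MatMultAdd (yy == zz) with a non-compressed A, a single SpMV with beta = 1 computes zz = A*xx + zz directly */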
3770       xptr = xarray;
3771       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3772       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3773      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3774       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3775           allocated to accommodate different uses. So we get the length info directly from mat.
3776        */
3777       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3778         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3779         nx = mat->num_cols;
3780         ny = mat->num_rows;
3781       }
3782      #endif
3783     } else {
3784       /* z = A^T x + beta y
3785          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3786          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3787        */
3788       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3789       dptr = zarray;
3790       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3791       if (compressed) { /* Scatter x to work vector */
3792         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3793         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3794                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3795                          VecCUDAEqualsReverse());
3796       }
3797      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3798       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3799         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3800         nx = mat->num_rows;
3801         ny = mat->num_cols;
3802       }
3803      #endif
3804     }
3805 
3806     /* csr_spmv does y = alpha op(A) x + beta y */
3807     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3808      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3809       PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3810       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
3811         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
3812         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
3813         PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3814                                                matstruct->matDescr,
3815                                                matstruct->cuSpMV[opA].vecXDescr, beta,
3816                                                matstruct->cuSpMV[opA].vecYDescr,
3817                                                cusparse_scalartype,
3818                                                cusparsestruct->spmvAlg,
3819                                                &matstruct->cuSpMV[opA].spmvBufferSize));
3820         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));
3821 
3822         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3823       } else {
3824         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
3825         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
3826         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
3827       }
3828 
3829       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
3830                                   matstruct->alpha_one,
3831                                   matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3832                                   matstruct->cuSpMV[opA].vecXDescr,
3833                                   beta,
3834                                   matstruct->cuSpMV[opA].vecYDescr,
3835                                   cusparse_scalartype,
3836                                   cusparsestruct->spmvAlg,
3837                                   matstruct->cuSpMV[opA].spmvBuffer));
3838      #else
3839       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3840       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
3841                                        mat->num_rows, mat->num_cols,
3842                                        mat->num_entries, matstruct->alpha_one, matstruct->descr,
3843                                        mat->values->data().get(), mat->row_offsets->data().get(),
3844                                        mat->column_indices->data().get(), xptr, beta,
3845                                        dptr));
3846      #endif
3847     } else {
3848       if (cusparsestruct->nrows) {
3849        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3850         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3851        #else
3852         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
3853         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
3854                                          matstruct->alpha_one, matstruct->descr, hybMat,
3855                                          xptr, beta,
3856                                          dptr));
3857        #endif
3858       }
3859     }
3860     PetscCall(PetscLogGpuTimeEnd());
3861 
3862     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3863       if (yy) { /* MatMultAdd: zz = A*xx + yy */
3864         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3865           PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
3866         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3867           PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
3868         }
3869       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3870         PetscCall(VecSet_SeqCUDA(zz,0));
3871       }
3872 
3873       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3874       if (compressed) {
3875         PetscCall(PetscLogGpuTimeBegin());
3876         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered),
3877            and when that event goes out of scope its destructor calls cudaStreamSynchronize() on the stream. One would have to store
3878            all the events to prevent that, so I just use a ScatterAdd kernel instead.
3879          */
3880        #if 0
3881         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3882         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3883                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3884                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3885                          VecCUDAPlusEquals());
3886        #else
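        /* launch one thread per compressed row: (n+255)/256 blocks of 256 threads cover all n entries */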
3887         PetscInt n = matstruct->cprowIndices->size();
3888         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3889        #endif
3890         PetscCall(PetscLogGpuTimeEnd());
3891       }
3892     } else {
3893       if (yy && yy != zz) {
3894         PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
3895       }
3896     }
3897     PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
3898     if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
3899     else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
3900   } catch(char *ex) {
3901     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
3902   }
3903   if (yy) {
3904     PetscCall(PetscLogGpuFlops(2.0*a->nz));
3905   } else {
3906     PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
3907   }
3908   PetscFunctionReturn(0);
3909 }
3910 
3911 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3912 {
3913   PetscFunctionBegin;
3914   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
3915   PetscFunctionReturn(0);
3916 }
3917 
3918 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
3919 {
3920   PetscObjectState   onnz = A->nonzerostate;
3921   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3922 
3923   PetscFunctionBegin;
3924   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
3925   if (onnz != A->nonzerostate && cusp->deviceMat) {
3927     PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
3928     PetscCallCUDA(cudaFree(cusp->deviceMat));
3929     cusp->deviceMat = NULL;
3930   }
3931   PetscFunctionReturn(0);
3932 }
3933 
3934 /* --------------------------------------------------------------------------------*/
3935 /*@
3936    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3937    (the default sequential PETSc format). This matrix will ultimately be pushed down
3938    to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
3939    assembly performance the user should preallocate the matrix storage by setting
3940    the parameter nz (or the array nnz).  By setting these parameters accurately,
3941    performance during matrix assembly can be increased by more than a factor of 50.
3942 
3943    Collective
3944 
3945    Input Parameters:
3946 +  comm - MPI communicator, set to PETSC_COMM_SELF
3947 .  m - number of rows
3948 .  n - number of columns
3949 .  nz - number of nonzeros per row (same for all rows)
3950 -  nnz - array containing the number of nonzeros in the various rows
3951          (possibly different for each row) or NULL
3952 
3953    Output Parameter:
3954 .  A - the matrix
3955 
3956    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3957    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3958    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3959 
3960    Notes:
3961    If nnz is given then nz is ignored
3962 
3963    The AIJ format (also called the Yale sparse matrix format or
3964    compressed row storage), is fully compatible with standard Fortran 77
3965    storage.  That is, the stored row and column indices can begin at
3966    either one (as in Fortran) or zero.  See the users' manual for details.
3967 
3968    Specify the preallocated storage with either nz or nnz (not both).
3969    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3970    allocation.  For large problems you MUST preallocate memory or you
3971    will get TERRIBLE performance, see the users' manual chapter on matrices.
3972 
3973    By default, this format uses inodes (identical nodes) when possible, to
3974    improve numerical efficiency of matrix-vector products and solves. We
3975    search for consecutive rows with the same nonzero structure, thereby
3976    reusing matrix information to achieve increased efficiency.
3977 
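   Example usage, a minimal sketch (assuming roughly three nonzeros per row of an m x m matrix):
.vb
      Mat A;
      MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF,m,m,3,NULL,&A);
      /* ... insert entries with MatSetValues(), then assemble ... */
      MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);
      MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
      MatDestroy(&A);
.ve
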
3978    Level: intermediate
3979 
3980 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3981 @*/
3982 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
3983 {
3984   PetscFunctionBegin;
3985   PetscCall(MatCreate(comm,A));
3986   PetscCall(MatSetSizes(*A,m,n,m,n));
3987   PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
3988   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
3989   PetscFunctionReturn(0);
3990 }
3991 
3992 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3993 {
3994   PetscFunctionBegin;
3995   if (A->factortype == MAT_FACTOR_NONE) {
3996     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
3997   } else {
3998     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
3999   }
4000   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
4001   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
4002   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
4003   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
4004   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
4005   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
4006   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
4007   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
4008   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
4009   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
4010   PetscCall(MatDestroy_SeqAIJ(A));
4011   PetscFunctionReturn(0);
4012 }
4013 
4014 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
4015 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
4016 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
4017 {
4018   PetscFunctionBegin;
4019   PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
4020   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
4021   PetscFunctionReturn(0);
4022 }
4023 
4024 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
4025 {
4026   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
4027   Mat_SeqAIJCUSPARSE *cy;
4028   Mat_SeqAIJCUSPARSE *cx;
4029   PetscScalar        *ay;
4030   const PetscScalar  *ax;
4031   CsrMatrix          *csry,*csrx;
4032 
4033   PetscFunctionBegin;
4034   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
4035   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
4036   if (X->ops->axpy != Y->ops->axpy) {
4037     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
4038     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
4039     PetscFunctionReturn(0);
4040   }
4041   /* if we are here, it means both matrices live on the GPU (neither is bound to the CPU) */
4042   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
4043   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
4044   PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
4045   PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
4046   csry = (CsrMatrix*)cy->mat->mat;
4047   csrx = (CsrMatrix*)cx->mat->mat;
4048   /* see if we can turn this into a cublas axpy: if X and Y have identical nonzero patterns, Y += a*X reduces to an axpy on the value arrays */
4049   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
4050     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
4051     if (eq) {
4052       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
4053     }
4054     if (eq) str = SAME_NONZERO_PATTERN;
4055   }
4056   /* spgeam is buggy with one column */
4057   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
4058 
4059   if (str == SUBSET_NONZERO_PATTERN) {
4060     PetscScalar b = 1.0;
4061 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4062     size_t      bufferSize;
4063     void        *buffer;
4064 #endif
4065 
4066     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
4067     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
4068     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
4069 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4070     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
4071                                                   &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
4072                                                   &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
4073                                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
4074     PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
4075     PetscCall(PetscLogGpuTimeBegin());
4076     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
4077                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
4078                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
4079                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
4080     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
4081     PetscCall(PetscLogGpuTimeEnd());
4082     PetscCallCUDA(cudaFree(buffer));
4083 #else
4084     PetscCall(PetscLogGpuTimeBegin());
4085     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
4086                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
4087                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
4088                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
4089     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
4090     PetscCall(PetscLogGpuTimeEnd());
4091 #endif
4092     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
4093     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
4094     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
4095     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
4096   } else if (str == SAME_NONZERO_PATTERN) {
4097     cublasHandle_t cublasv2handle;
4098     PetscBLASInt   one = 1, bnz = 1;
4099 
4100     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
4101     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
4102     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
4103     PetscCall(PetscBLASIntCast(x->nz,&bnz));
4104     PetscCall(PetscLogGpuTimeBegin());
4105     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
4106     PetscCall(PetscLogGpuFlops(2.0*bnz));
4107     PetscCall(PetscLogGpuTimeEnd());
4108     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
4109     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
4110     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
4111   } else {
4112     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
4113     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
4114   }
4115   PetscFunctionReturn(0);
4116 }
4117 
4118 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
4119 {
4120   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
4121   PetscScalar    *ay;
4122   cublasHandle_t cublasv2handle;
4123   PetscBLASInt   one = 1, bnz = 1;
4124 
4125   PetscFunctionBegin;
4126   PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
4127   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
4128   PetscCall(PetscBLASIntCast(y->nz,&bnz));
4129   PetscCall(PetscLogGpuTimeBegin());
4130   PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
4131   PetscCall(PetscLogGpuFlops(bnz));
4132   PetscCall(PetscLogGpuTimeEnd());
4133   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
4134   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
4135   PetscFunctionReturn(0);
4136 }
4137 
4138 static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
4139 {
4140   PetscBool      both = PETSC_FALSE;
4141   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4142 
4143   PetscFunctionBegin;
4144   if (A->factortype == MAT_FACTOR_NONE) {
4145     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
4146     if (spptr->mat) {
4147       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
4148       if (matrix->values) {
4149         both = PETSC_TRUE;
4150         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
4151       }
4152     }
4153     if (spptr->matTranspose) {
4154       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
4155       if (matrix->values) {
4156         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
4157       }
4158     }
4159   }
4160   PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
4161   PetscCall(MatSeqAIJInvalidateDiagonal(A));
4162   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
4163   else A->offloadmask = PETSC_OFFLOAD_CPU;
4164   PetscFunctionReturn(0);
4165 }
4166 
4167 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
4168 {
4169   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4170 
4171   PetscFunctionBegin;
4172   if (A->factortype != MAT_FACTOR_NONE) {
4173     A->boundtocpu = flg;
4174     PetscFunctionReturn(0);
4175   }
4176   if (flg) {
4177     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
4178 
4179     A->ops->scale                     = MatScale_SeqAIJ;
4180     A->ops->axpy                      = MatAXPY_SeqAIJ;
4181     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
4182     A->ops->mult                      = MatMult_SeqAIJ;
4183     A->ops->multadd                   = MatMultAdd_SeqAIJ;
4184     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
4185     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
4186     A->ops->multhermitiantranspose    = NULL;
4187     A->ops->multhermitiantransposeadd = NULL;
4188     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
4189     PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
4190     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
4191     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
4192     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
4193     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
4194     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
4195     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
4196   } else {
4197     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
4198     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
4199     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
4200     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4201     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4202     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4203     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4204     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4205     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4206     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
4207     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
4208     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
4209     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
4210     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
4211     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
4212     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
4213     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
4214 
4215     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
4216     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
4217     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
4218     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
4219     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
4220     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
4221    }
4222   A->boundtocpu = flg;
4223   if (flg && a->inode.size) {
4224     a->inode.use = PETSC_TRUE;
4225   } else {
4226     a->inode.use = PETSC_FALSE;
4227   }
4228   PetscFunctionReturn(0);
4229 }
4230 
4231 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
4232 {
4233   Mat              B;
4234 
4235   PetscFunctionBegin;
4236   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
4237   if (reuse == MAT_INITIAL_MATRIX) {
4238     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
4239   } else if (reuse == MAT_REUSE_MATRIX) {
4240     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
4241   }
4242   B = *newmat;
4243 
4244   PetscCall(PetscFree(B->defaultvectype));
4245   PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));
4246 
4247   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
4248     if (B->factortype == MAT_FACTOR_NONE) {
4249       Mat_SeqAIJCUSPARSE *spptr;
4250       PetscCall(PetscNew(&spptr));
4251       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4252       PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
4253       spptr->format     = MAT_CUSPARSE_CSR;
4254      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4255      #if CUSPARSE_VERSION > 11301
4256       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4257      #else
4258       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
4259      #endif
4260       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4261       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4262      #endif
4263       B->spptr = spptr;
4264     } else {
4265       Mat_SeqAIJCUSPARSETriFactors *spptr;
4266 
4267       PetscCall(PetscNew(&spptr));
4268       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
4269       PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
4270       B->spptr = spptr;
4271     }
4272     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
4273   }
4274   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
4275   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
4276   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
4277   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
4278   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
4279   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
4280 
4281   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
4282   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
4283   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4284 #if defined(PETSC_HAVE_HYPRE)
4285   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
4286 #endif
4287   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
4288   PetscFunctionReturn(0);
4289 }
4290 
4291 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4292 {
4293   PetscFunctionBegin;
4294   PetscCall(MatCreate_SeqAIJ(B));
4295   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
4296   PetscFunctionReturn(0);
4297 }
4298 
4299 /*MC
4300    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4301 
4302    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
4303    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
4304    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
4305 
4306    Options Database Keys:
4307 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
4308 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
4309 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
4310 -  -mat_cusparse_use_cpu_solve - do MatSolve on the CPU
4311 
4312   Level: beginner
4313 
4314 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4315 M*/
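
/* A minimal usage sketch for MATSEQAIJCUSPARSE (not part of the man page above); m, n,
   and nz are user-provided sizes, and MatSeqAIJSetPreallocation() applies because this
   type is derived from MATSEQAIJ:

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_SELF,&A));
     PetscCall(MatSetSizes(A,m,n,m,n));
     PetscCall(MatSetType(A,MATSEQAIJCUSPARSE));
     PetscCall(MatSeqAIJSetPreallocation(A,nz,NULL));
     ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as for any AIJ matrix ...
*/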
4316 
4317 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
4318 
4319 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4320 {
4321   PetscFunctionBegin;
4322   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
4323   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
4324   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
4325   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
4326   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));
4327 
4328   PetscFunctionReturn(0);
4329 }
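
/* Sketch of how the registrations above are typically exercised; these are the standard
   factorization options, with MATSOLVERCUSPARSE being the solver type registered here:

     -mat_type aijcusparse -pc_type ilu -pc_factor_mat_solver_type cusparse

   or, programmatically (A and F are user variables):

     PetscCall(MatGetFactor(A,MATSOLVERCUSPARSE,MAT_FACTOR_LU,&F));
*/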
4330 
4331 static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
4332 {
4333   Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;
4334 
4335   PetscFunctionBegin;
4336   if (!cusp) PetscFunctionReturn(0);
4337   delete cusp->cooPerm;
4338   delete cusp->cooPerm_a;
4339   cusp->cooPerm = NULL;
4340   cusp->cooPerm_a = NULL;
4341   if (cusp->use_extended_coo) {
4342     PetscCallCUDA(cudaFree(cusp->jmap_d));
4343     PetscCallCUDA(cudaFree(cusp->perm_d));
4344   }
4345   cusp->use_extended_coo = PETSC_FALSE;
4346   PetscFunctionReturn(0);
4347 }
4348 
4349 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
4350 {
4351   PetscFunctionBegin;
4352   if (*cusparsestruct) {
4353     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
4354     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
4355     delete (*cusparsestruct)->workVector;
4356     delete (*cusparsestruct)->rowoffsets_gpu;
4357     delete (*cusparsestruct)->cooPerm;
4358     delete (*cusparsestruct)->cooPerm_a;
4359     delete (*cusparsestruct)->csr2csc_i;
4360     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
4361     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
4362     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
4363     PetscCall(PetscFree(*cusparsestruct));
4364   }
4365   PetscFunctionReturn(0);
4366 }
4367 
4368 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4369 {
4370   PetscFunctionBegin;
4371   if (*mat) {
4372     delete (*mat)->values;
4373     delete (*mat)->column_indices;
4374     delete (*mat)->row_offsets;
4375     delete *mat;
4376     *mat = NULL;
4377   }
4378   PetscFunctionReturn(0);
4379 }
4380 
4381 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4382 {
4383   PetscFunctionBegin;
4384   if (*trifactor) {
4385     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4386     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
4387     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
4388     if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
4389     if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4390    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4391     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4392    #endif
4393     PetscCall(PetscFree(*trifactor));
4394   }
4395   PetscFunctionReturn(0);
4396 }
4397 
4398 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
4399 {
4400   CsrMatrix        *mat;
4401 
4402   PetscFunctionBegin;
4403   if (*matstruct) {
4404     if ((*matstruct)->mat) {
4405       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
4406        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4407         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4408        #else
4409         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
4410         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4411        #endif
4412       } else {
4413         mat = (CsrMatrix*)(*matstruct)->mat;
4414         PetscCall(CsrMatrix_Destroy(&mat));
4415       }
4416     }
4417     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
4418     delete (*matstruct)->cprowIndices;
4419     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
4420     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
4421     if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4422 
4423    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4424     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
4425     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4426     for (int i=0; i<3; i++) {
4427       if (mdata->cuSpMV[i].initialized) {
4428         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
4429         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
4430         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4431       }
4432     }
4433    #endif
4434     delete *matstruct;
4435     *matstruct = NULL;
4436   }
4437   PetscFunctionReturn(0);
4438 }
4439 
4440 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
4441 {
4442   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4443 
4444   PetscFunctionBegin;
4445   if (fs) {
4446     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4447     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4448     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4449     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4450     delete fs->rpermIndices;
4451     delete fs->cpermIndices;
4452     delete fs->workVector;
4453     fs->rpermIndices = NULL;
4454     fs->cpermIndices = NULL;
4455     fs->workVector = NULL;
4456     if (fs->a_band_d)   PetscCallCUDA(cudaFree(fs->a_band_d));
4457     if (fs->i_band_d)   PetscCallCUDA(cudaFree(fs->i_band_d));
4458     fs->init_dev_prop = PETSC_FALSE;
4459    #if CUSPARSE_VERSION >= 11500
4460     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4461     PetscCallCUDA(cudaFree(fs->csrColIdx));
4462     PetscCallCUDA(cudaFree(fs->csrVal));
4463     PetscCallCUDA(cudaFree(fs->X));
4464     PetscCallCUDA(cudaFree(fs->Y));
4465     PetscCallCUDA(cudaFree(fs->factBuffer_M));
4466     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4467     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4468     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
4469     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4470     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4471     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4472     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4473     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4474     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4475     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4476     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4477     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4478     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4479     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4480     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4481     fs->builtSolveTranspose = PETSC_FALSE;
4482    #endif
4483   }
4484   PetscFunctionReturn(0);
4485 }
4486 
4487 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
4488 {
4489   cusparseHandle_t handle;
4490 
4491   PetscFunctionBegin;
4492   if (*trifactors) {
4493     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4494     if ((handle = (*trifactors)->handle)) {
4495       PetscCallCUSPARSE(cusparseDestroy(handle));
4496     }
4497     PetscCall(PetscFree(*trifactors));
4498   }
4499   PetscFunctionReturn(0);
4500 }
4501 
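/* Thrust functors used by the COO assembly routines below: IJCompare orders (i,j)
   pairs lexicographically (by row, then by column), IJEqual detects duplicate entries,
   and IJDiff/IJSum combine to flag the positions in the sorted sequence where a new
   unique nonzero starts. */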
4502 struct IJCompare
4503 {
4504   __host__ __device__
4505   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4506   {
4507     if (t1.get<0>() < t2.get<0>()) return true;
4508     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4509     return false;
4510   }
4511 };
4512 
4513 struct IJEqual
4514 {
4515   __host__ __device__
4516   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4517   {
4518     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
4519     return true;
4520   }
4521 };
4522 
4523 struct IJDiff
4524 {
4525   __host__ __device__
4526   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
4527   {
4528     return t1 == t2 ? 0 : 1;
4529   }
4530 };
4531 
4532 struct IJSum
4533 {
4534   __host__ __device__
4535   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
4536   {
4537     return t1||t2;
4538   }
4539 };
4540 
4541 #include <thrust/iterator/discard_iterator.h>
4542 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
4543 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
4544 {
4545   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4546   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
4547   THRUSTARRAY                           *cooPerm_v = NULL;
4548   thrust::device_ptr<const PetscScalar> d_v;
4549   CsrMatrix                             *matrix;
4550   PetscInt                              n;
4551 
4552   PetscFunctionBegin;
4553   PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
4554   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
4555   if (!cusp->cooPerm) {
4556     PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
4557     PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
4558     PetscFunctionReturn(0);
4559   }
4560   matrix = (CsrMatrix*)cusp->mat->mat;
4561   PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4562   if (!v) {
4563     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
4564     goto finalize;
4565   }
4566   n = cusp->cooPerm->size();
4567   if (isCudaMem(v)) {
4568     d_v = thrust::device_pointer_cast(v);
4569   } else {
4570     cooPerm_v = new THRUSTARRAY(n);
4571     cooPerm_v->assign(v,v+n);
4572     d_v = cooPerm_v->data();
4573     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
4574   }
4575   PetscCall(PetscLogGpuTimeBegin());
4576   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
4577     if (cusp->cooPerm_a) { /* there are repeated (i,j) entries, and we need to sum the corresponding d_v[] values first */
4578       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
4579       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
4580       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4581         cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
4582         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
4583       */
4584       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
4585       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
4586       delete cooPerm_w;
4587     } else {
4588       /* all nonzeros in d_v[] are unique entries */
4589       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
4590                                                                 matrix->values->begin()));
4591       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
4592                                                                 matrix->values->end()));
4593       thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
4594     }
4595   } else {
4596     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
4597       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
4598       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
4599     } else {
4600       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
4601                                                                 matrix->values->begin()));
4602       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
4603                                                                 matrix->values->end()));
4604       thrust::for_each(zibit,zieit,VecCUDAEquals());
4605     }
4606   }
4607   PetscCall(PetscLogGpuTimeEnd());
4608 finalize:
4609   delete cooPerm_v;
4610   A->offloadmask = PETSC_OFFLOAD_GPU;
4611   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4612   /* shorter version of MatAssemblyEnd_SeqAIJ */
4613   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
4614   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
4615   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
4616   a->reallocs         = 0;
4617   A->info.mallocs    += 0;
4618   A->info.nz_unneeded = 0;
4619   A->assembled = A->was_assembled = PETSC_TRUE;
4620   A->num_ass++;
4621   PetscFunctionReturn(0);
4622 }
4623 
4624 PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4625 {
4626   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4627 
4628   PetscFunctionBegin;
4629   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4630   if (!cusp) PetscFunctionReturn(0);
4631   if (destroy) {
4632     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
4633     delete cusp->csr2csc_i;
4634     cusp->csr2csc_i = NULL;
4635   }
4636   A->transupdated = PETSC_FALSE;
4637   PetscFunctionReturn(0);
4638 }
4639 
4640 #include <thrust/binary_search.h>
4641 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4642 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
4643 {
4644   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4645   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
4646   PetscInt           cooPerm_n, nzr = 0;
4647 
4648   PetscFunctionBegin;
4649   PetscCall(PetscLayoutSetUp(A->rmap));
4650   PetscCall(PetscLayoutSetUp(A->cmap));
4651   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
4652   if (n != cooPerm_n) {
4653     delete cusp->cooPerm;
4654     delete cusp->cooPerm_a;
4655     cusp->cooPerm = NULL;
4656     cusp->cooPerm_a = NULL;
4657   }
4658   if (n) {
4659     THRUSTINTARRAY d_i(n);
4660     THRUSTINTARRAY d_j(n);
4661     THRUSTINTARRAY ii(A->rmap->n);
4662 
4663     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
4664     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
4665 
4666     PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
4667     d_i.assign(coo_i,coo_i+n);
4668     d_j.assign(coo_j,coo_j+n);
4669 
4670     /* Ex.
4671       n = 6
4672       coo_i = [3,3,1,4,1,4]
4673       coo_j = [3,2,2,5,2,6]
4674     */
4675     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
4676     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
4677 
4678     PetscCall(PetscLogGpuTimeBegin());
4679     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4680     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4681     *cusp->cooPerm_a = d_i; /* copy the sorted array */
4682     THRUSTINTARRAY w = d_j;
4683 
4684     /*
4685       d_i     = [1,1,3,3,4,4]
4686       d_j     = [2,2,2,3,5,6]
4687       cooPerm = [2,4,1,0,3,5]
4688     */
4689     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4690 
4691     /*
4692       d_i     = [1,3,3,4,4,x]
4693                             ^ekey
4694       d_j     = [2,2,3,5,6,x]
4695                            ^nekey
4696     */
4697     if (nekey == ekey) { /* all entries are unique */
4698       delete cusp->cooPerm_a;
4699       cusp->cooPerm_a = NULL;
4700     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4701       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4702       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4703       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4704       (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
4705       w[0] = 0;
4706       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
4707       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
4708     }
4709     thrust::counting_iterator<PetscInt> search_begin(0);
4710     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4711                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4712                         ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
4713     PetscCall(PetscLogGpuTimeEnd());
4714 
4715     PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
4716     a->singlemalloc = PETSC_FALSE;
4717     a->free_a       = PETSC_TRUE;
4718     a->free_ij      = PETSC_TRUE;
4719     PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
4720     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
4721     PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4722     a->nz = a->maxnz = a->i[A->rmap->n];
4723     a->rmax = 0;
4724     PetscCall(PetscMalloc1(a->nz,&a->a));
4725     PetscCall(PetscMalloc1(a->nz,&a->j));
4726     PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4727     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
4728     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
4729     for (PetscInt i = 0; i < A->rmap->n; i++) {
4730       const PetscInt nnzr = a->i[i+1] - a->i[i];
4731       nzr += (PetscInt)!!(nnzr);
4732       a->ilen[i] = a->imax[i] = nnzr;
4733       a->rmax = PetscMax(a->rmax,nnzr);
4734     }
4735     a->nonzerorowcnt = nzr;
4736     A->preallocated = PETSC_TRUE;
4737     PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
4738     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4739   } else {
4740     PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
4741   }
4742   PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));
4743 
4744   /* We want to allocate the CUSPARSE struct for matvec now.
4745      The code is so convoluted now that I prefer to copy zeros */
4746   PetscCall(PetscArrayzero(a->a,a->nz));
4747   PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
4748   A->offloadmask = PETSC_OFFLOAD_CPU;
4749   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4750   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
4751   PetscFunctionReturn(0);
4752 }
4753 
4754 PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
4755 {
4756   Mat_SeqAIJ         *seq;
4757   Mat_SeqAIJCUSPARSE *dev;
4758   PetscBool          coo_basic = PETSC_TRUE;
4759   PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;
4760 
4761   PetscFunctionBegin;
4762   PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
4763   PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
4764   if (coo_i) {
4765     PetscCall(PetscGetMemType(coo_i,&mtype));
4766     if (PetscMemTypeHost(mtype)) {
4767       for (PetscCount k=0; k<coo_n; k++) {
4768         if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
4769       }
4770     }
4771   }
4772 
4773   if (coo_basic) { /* i,j are on device or do not contain negative indices */
4774     PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
4775   } else {
4776     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
4777     mat->offloadmask = PETSC_OFFLOAD_CPU;
4778     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4779     seq  = static_cast<Mat_SeqAIJ*>(mat->data);
4780     dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
4781     PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
4782     PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
4783     PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
4784     PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
4785     dev->use_extended_coo = PETSC_TRUE;
4786   }
4787   PetscFunctionReturn(0);
4788 }
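
/* A minimal COO assembly sketch using the public interface that dispatches to the
   routines above; n, coo_i, coo_j, and v are user-provided, and v may reside in host
   or device memory:

     PetscCall(MatSetPreallocationCOO(A,n,coo_i,coo_j));
     PetscCall(MatSetValuesCOO(A,v,INSERT_VALUES));

   Negative entries in coo_i[]/coo_j[] (ignored values) force the extended COO path;
   otherwise the device-side 'Basic' path above is taken. */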
4789 
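/* Grid-stride kernel for the extended COO path: thread i sums all repeated COO inputs
   mapped to the i-th stored nonzero through the jmap[]/perm[] arrays built on the host
   by MatSetPreallocationCOO_SeqAIJ() */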
4790 __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
4791 {
4792   PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
4793   const PetscCount  grid_size = gridDim.x * blockDim.x;
4794   for (; i<nnz; i+= grid_size) {
4795     PetscScalar sum = 0.0;
4796     for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
4797     a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
4798   }
4799 }
4800 
4801 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4802 {
4803   Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
4804   Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
4805   PetscCount          Annz = seq->nz;
4806   PetscMemType        memtype;
4807   const PetscScalar   *v1 = v;
4808   PetscScalar         *Aa;
4809 
4810   PetscFunctionBegin;
4811   if (dev->use_extended_coo) {
4812     PetscCall(PetscGetMemType(v,&memtype));
4813     if (PetscMemTypeHost(memtype)) { /* if the user gave v[] on the host, copy it to the device */
4814       PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
4815       PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
4816     }
4817 
4818     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
4819     else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));
4820 
4821     if (Annz) {
4822       MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
4823       PetscCallCUDA(cudaPeekAtLastError());
4824     }
4825 
4826     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
4827     else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
4828 
4829     if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
4830   } else {
4831     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
4832   }
4833   PetscFunctionReturn(0);
4834 }
4835 
4836 /*@C
4837     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4838 
4839    Not collective
4840 
4841     Input Parameters:
4842 +   A - the matrix
4843 -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4844 
4845     Output Parameters:
4846 +   ia - the CSR row pointers
4847 -   ja - the CSR column indices
4848 
4849     Level: developer
4850 
4851     Notes:
4852       When compressed is true, the CSR structure does not contain empty rows
4853 
4854 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4855 @*/
4856 PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4857 {
4858   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4859   CsrMatrix          *csr;
4860   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
4861 
4862   PetscFunctionBegin;
4863   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4864   if (!i || !j) PetscFunctionReturn(0);
4865   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4866   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4867   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4868   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4869   csr = (CsrMatrix*)cusp->mat->mat;
4870   if (i) {
4871     if (!compressed && a->compressedrow.use) { /* need full row offset */
4872       if (!cusp->rowoffsets_gpu) {
4873         cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4874         cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4875         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
4876       }
4877       *i = cusp->rowoffsets_gpu->data().get();
4878     } else *i = csr->row_offsets->data().get();
4879   }
4880   if (j) *j = csr->column_indices->data().get();
4881   PetscFunctionReturn(0);
4882 }
4883 
4884 /*@C
4885     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4886 
4887    Not collective
4888 
4889     Input Parameters:
4890 +   A - the matrix
4891 -   compressed - PETSC_TRUE or PETSC_FALSE indicating whether the matrix data structure should always be returned in compressed form
4892 
4893     Output Parameters:
4894 +   ia - the CSR row pointers
4895 -   ja - the CSR column indices
4896 
4897     Level: developer
4898 
4899 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4900 @*/
4901 PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
4902 {
4903   PetscFunctionBegin;
4904   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4905   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4906   if (i) *i = NULL;
4907   if (j) *j = NULL;
4908   PetscFunctionReturn(0);
4909 }
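
/* Typical access pattern for the two routines above; A is a MATSEQAIJCUSPARSE matrix
   and the returned pointers reference device memory:

     const int *i,*j;
     PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_TRUE,&i,&j));
     ... use i[]/j[] on the device, e.g. from a custom CUDA kernel ...
     PetscCall(MatSeqAIJCUSPARSERestoreIJ(A,PETSC_TRUE,&i,&j));
*/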
4910 
4911 /*@C
4912    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4913 
4914    Not Collective
4915 
4916    Input Parameter:
4917 .   A - a MATSEQAIJCUSPARSE matrix
4918 
4919    Output Parameter:
4920 .   a - pointer to the device data
4921 
4922    Level: developer
4923 
4924    Notes: may trigger host-device copies if up-to-date matrix data is on host
4925 
4926 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4927 @*/
4928 PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4929 {
4930   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4931   CsrMatrix          *csr;
4932 
4933   PetscFunctionBegin;
4934   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4935   PetscValidPointer(a,2);
4936   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4937   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4938   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4939   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4940   csr = (CsrMatrix*)cusp->mat->mat;
4941   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4942   *a = csr->values->data().get();
4943   PetscFunctionReturn(0);
4944 }
4945 
4946 /*@C
4947    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4948 
4949    Not Collective
4950 
4951    Input Parameter:
4952 .   A - a MATSEQAIJCUSPARSE matrix
4953 
4954    Output Parameter:
4955 .   a - pointer to the device data
4956 
4957    Level: developer
4958 
4959 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4960 @*/
4961 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4962 {
4963   PetscFunctionBegin;
4964   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4965   PetscValidPointer(a,2);
4966   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4967   *a = NULL;
4968   PetscFunctionReturn(0);
4969 }
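
/* The Get/Restore routines here follow the usual PETSc access protocol; a read-only
   sketch (A is a MATSEQAIJCUSPARSE matrix, a points to device memory):

     const PetscScalar *a;
     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&a));
     ... read a[] on the device ...
     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&a));
*/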
4970 
4971 /*@C
4972    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4973 
4974    Not Collective
4975 
4976    Input Parameter:
4977 .   A - a MATSEQAIJCUSPARSE matrix
4978 
4979    Output Parameter:
4980 .   a - pointer to the device data
4981 
4982    Level: developer
4983 
4984    Notes: may trigger host-device copies if up-to-date matrix data is on host
4985 
4986 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4987 @*/
4988 PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4989 {
4990   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4991   CsrMatrix          *csr;
4992 
4993   PetscFunctionBegin;
4994   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4995   PetscValidPointer(a,2);
4996   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4997   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4998   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
4999   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5000   csr = (CsrMatrix*)cusp->mat->mat;
5001   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
5002   *a = csr->values->data().get();
5003   A->offloadmask = PETSC_OFFLOAD_GPU;
5004   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
5005   PetscFunctionReturn(0);
5006 }
5007 /*@C
5008    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
5009 
5010    Not Collective
5011 
5012    Input Parameter:
5013 .   A - a MATSEQAIJCUSPARSE matrix
5014 
5015    Output Parameter:
5016 .   a - pointer to the device data
5017 
5018    Level: developer
5019 
5020 .seealso: `MatSeqAIJCUSPARSEGetArray()`
5021 @*/
5022 PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
5023 {
5024   PetscFunctionBegin;
5025   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5026   PetscValidPointer(a,2);
5027   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5028   PetscCall(MatSeqAIJInvalidateDiagonal(A));
5029   PetscCall(PetscObjectStateIncrease((PetscObject)A));
5030   *a = NULL;
5031   PetscFunctionReturn(0);
5032 }
5033 
5034 /*@C
5035    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
5036 
5037    Not Collective
5038 
5039    Input Parameter:
5040 .   A - a MATSEQAIJCUSPARSE matrix
5041 
5042    Output Parameter:
5043 .   a - pointer to the device data
5044 
5045    Level: developer
5046 
5047    Notes: does not trigger host-device copies and flags data validity on the GPU
5048 
5049 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
5050 @*/
5051 PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
5052 {
5053   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
5054   CsrMatrix          *csr;
5055 
5056   PetscFunctionBegin;
5057   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5058   PetscValidPointer(a,2);
5059   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5060   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5061   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5062   csr = (CsrMatrix*)cusp->mat->mat;
5063   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
5064   *a = csr->values->data().get();
5065   A->offloadmask = PETSC_OFFLOAD_GPU;
5066   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
5067   PetscFunctionReturn(0);
5068 }
5069 
5070 /*@C
5071    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
5072 
5073    Not Collective
5074 
5075    Input Parameter:
5076 .   A - a MATSEQAIJCUSPARSE matrix
5077 
5078    Output Parameter:
5079 .   a - pointer to the device data
5080 
5081    Level: developer
5082 
5083 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
5084 @*/
5085 PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
5086 {
5087   PetscFunctionBegin;
5088   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5089   PetscValidPointer(a,2);
5090   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5091   PetscCall(MatSeqAIJInvalidateDiagonal(A));
5092   PetscCall(PetscObjectStateIncrease((PetscObject)A));
5093   *a = NULL;
5094   PetscFunctionReturn(0);
5095 }
5096 
5097 struct IJCompare4
5098 {
5099   __host__ __device__
5100   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
5101   {
5102     if (t1.get<0>() < t2.get<0>()) return true;
5103     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
5104     return false;
5105   }
5106 };
5107 
5108 struct Shift
5109 {
5110   int _shift;
5111 
5112   Shift(int shift) : _shift(shift) {}
5113   __host__ __device__
5114   inline int operator() (const int &c)
5115   {
5116     return c + _shift;
5117   }
5118 };
5119 
5120 /* merges two SeqAIJCUSPARSE matrices A, B side by side: each row of C is the corresponding row of A followed by that of B; the [A';B']' operation in MATLAB notation */
5121 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
5122 {
5123   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
5124   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
5125   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
5126   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
5127   PetscInt                     Annz,Bnnz;
5128   cusparseStatus_t             stat;
5129   PetscInt                     i,m,n,zero = 0;
5130 
5131   PetscFunctionBegin;
5132   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5133   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
5134   PetscValidPointer(C,4);
5135   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5136   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
5137   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
5138   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
5139   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5140   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5141   if (reuse == MAT_INITIAL_MATRIX) {
5142     m     = A->rmap->n;
5143     n     = A->cmap->n + B->cmap->n;
5144     PetscCall(MatCreate(PETSC_COMM_SELF,C));
5145     PetscCall(MatSetSizes(*C,m,n,m,n));
5146     PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
5147     c     = (Mat_SeqAIJ*)(*C)->data;
5148     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
5149     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
5150     Ccsr  = new CsrMatrix;
5151     Cmat->cprowIndices      = NULL;
5152     c->compressedrow.use    = PETSC_FALSE;
5153     c->compressedrow.nrows  = 0;
5154     c->compressedrow.i      = NULL;
5155     c->compressedrow.rindex = NULL;
5156     Ccusp->workVector       = NULL;
5157     Ccusp->nrows    = m;
5158     Ccusp->mat      = Cmat;
5159     Ccusp->mat->mat = Ccsr;
5160     Ccsr->num_rows  = m;
5161     Ccsr->num_cols  = n;
5162     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
5163     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
5164     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5165     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
5166     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
5167     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
5168     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5169     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
5170     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5171     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5172     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5173     PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5174     PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5175 
5176     Acsr = (CsrMatrix*)Acusp->mat->mat;
5177     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
5178     Annz = (PetscInt)Acsr->column_indices->size();
5179     Bnnz = (PetscInt)Bcsr->column_indices->size();
5180     c->nz = Annz + Bnnz;
5181     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
5182     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
5183     Ccsr->values = new THRUSTARRAY(c->nz);
5184     Ccsr->num_entries = c->nz;
5185     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
5186     if (c->nz) {
5187       auto Acoo = new THRUSTINTARRAY32(Annz);
5188       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
5189       auto Ccoo = new THRUSTINTARRAY32(c->nz);
5190       THRUSTINTARRAY32 *Aroff,*Broff;
5191 
5192       if (a->compressedrow.use) { /* need full row offset */
5193         if (!Acusp->rowoffsets_gpu) {
5194           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
5195           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
5196           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
5197         }
5198         Aroff = Acusp->rowoffsets_gpu;
5199       } else Aroff = Acsr->row_offsets;
5200       if (b->compressedrow.use) { /* need full row offset */
5201         if (!Bcusp->rowoffsets_gpu) {
5202           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
5203           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
5204           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
5205         }
5206         Broff = Bcusp->rowoffsets_gpu;
5207       } else Broff = Bcsr->row_offsets;
5208       PetscCall(PetscLogGpuTimeBegin());
5209       stat = cusparseXcsr2coo(Acusp->handle,
5210                               Aroff->data().get(),
5211                               Annz,
5212                               m,
5213                               Acoo->data().get(),
5214                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5215       stat = cusparseXcsr2coo(Bcusp->handle,
5216                               Broff->data().get(),
5217                               Bnnz,
5218                               m,
5219                               Bcoo->data().get(),
5220                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5221       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
5222       auto Aperm = thrust::make_constant_iterator(1);
5223       auto Bperm = thrust::make_constant_iterator(0);
5224 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
5225       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
5226       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
5227 #else
5228       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
5229       auto Bcib = Bcsr->column_indices->begin();
5230       auto Bcie = Bcsr->column_indices->end();
5231       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
5232 #endif
5233       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
5234       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
5235       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
5236       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
5237       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
5238       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
5239       auto p1 = Ccusp->cooPerm->begin();
5240       auto p2 = Ccusp->cooPerm->begin();
5241       thrust::advance(p2,Annz);
5242       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
5243 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
5244       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
5245 #endif
5246       auto cci = thrust::make_counting_iterator(zero);
5247       auto cce = thrust::make_counting_iterator(c->nz);
5248 #if 0 //Errors on SUMMIT cuda 11.1.0
5249       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
5250 #else
5251       auto pred = thrust::identity<int>();
5252       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
5253       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
5254 #endif
5255       stat = cusparseXcoo2csr(Ccusp->handle,
5256                               Ccoo->data().get(),
5257                               c->nz,
5258                               m,
5259                               Ccsr->row_offsets->data().get(),
5260                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5261       PetscCall(PetscLogGpuTimeEnd());
5262       delete wPerm;
5263       delete Acoo;
5264       delete Bcoo;
5265       delete Ccoo;
5266 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
5267       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
5268                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
5269                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
5270                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
5271 #endif
5272       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
5273         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
5274         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
5275         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5276         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
5277         CsrMatrix *CcsrT = new CsrMatrix;
5278         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
5279         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
5280 
5281         (*C)->form_explicit_transpose = PETSC_TRUE;
5282         (*C)->transupdated = PETSC_TRUE;
5283         Ccusp->rowoffsets_gpu = NULL;
5284         CmatT->cprowIndices = NULL;
5285         CmatT->mat = CcsrT;
5286         CcsrT->num_rows = n;
5287         CcsrT->num_cols = m;
5288         CcsrT->num_entries = c->nz;
5289 
5290         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
5291         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
5292         CcsrT->values = new THRUSTARRAY(c->nz);
5293 
5294         PetscCall(PetscLogGpuTimeBegin());
5295         auto rT = CcsrT->row_offsets->begin();
5296         if (AT) {
5297           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
5298           thrust::advance(rT,-1);
5299         }
5300         if (BT) {
5301           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
5302           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
5303           thrust::copy(titb,tite,rT);
5304         }
5305         auto cT = CcsrT->column_indices->begin();
5306         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
5307         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
5308         auto vT = CcsrT->values->begin();
5309         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
5310         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
5311         PetscCall(PetscLogGpuTimeEnd());
5312 
5313         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
5314         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
5315         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
5316         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
5317         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
5318         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
5319         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5320         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
5321         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5322 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
5323         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
5324                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
5325                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
5326                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
5327 #endif
5328         Ccusp->matTranspose = CmatT;
5329       }
5330     }
5331 
5332     c->singlemalloc = PETSC_FALSE;
5333     c->free_a       = PETSC_TRUE;
5334     c->free_ij      = PETSC_TRUE;
5335     PetscCall(PetscMalloc1(m+1,&c->i));
5336     PetscCall(PetscMalloc1(c->nz,&c->j));
5337     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
5338       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
5339       THRUSTINTARRAY jj(Ccsr->column_indices->size());
5340       ii   = *Ccsr->row_offsets;
5341       jj   = *Ccsr->column_indices;
5342       PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5343       PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5344     } else {
5345       PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5346       PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5347     }
5348     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
5349     PetscCall(PetscMalloc1(m,&c->ilen));
5350     PetscCall(PetscMalloc1(m,&c->imax));
5351     c->maxnz = c->nz;
5352     c->nonzerorowcnt = 0;
5353     c->rmax = 0;
5354     for (i = 0; i < m; i++) {
5355       const PetscInt nn = c->i[i+1] - c->i[i];
5356       c->ilen[i] = c->imax[i] = nn;
5357       c->nonzerorowcnt += (PetscInt)!!nn;
5358       c->rmax = PetscMax(c->rmax,nn);
5359     }
5360     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
5361     PetscCall(PetscMalloc1(c->nz,&c->a));
5362     (*C)->nonzerostate++;
5363     PetscCall(PetscLayoutSetUp((*C)->rmap));
5364     PetscCall(PetscLayoutSetUp((*C)->cmap));
5365     Ccusp->nonzerostate = (*C)->nonzerostate;
5366     (*C)->preallocated  = PETSC_TRUE;
5367   } else {
5368     PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
5369     c = (Mat_SeqAIJ*)(*C)->data;
5370     if (c->nz) {
5371       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
5372       PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
5373       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5374       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
5375       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
5376       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
5377       PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5378       PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5379       Acsr = (CsrMatrix*)Acusp->mat->mat;
5380       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
5381       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
5382       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
5383       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
5384       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
5385       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
5386       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
5387       auto pmid = Ccusp->cooPerm->begin();
5388       thrust::advance(pmid,Acsr->num_entries);
5389       PetscCall(PetscLogGpuTimeBegin());
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
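      /* When explicit transposes are kept, C^T needs no csr2csc conversion here:
         its value array is A^T's values followed by B^T's values, so two device
         copies are enough to refresh it */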
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

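/* Gather the matrix values at positions idx[0..n-1] of the AIJ value array into v.
   The gather runs on the GPU; v may point to either host or device memory. A minimal
   usage sketch (hypothetical caller: 'mat', 'idx', and 'vals' below are illustrative,
   not part of this file):

     PetscInt    idx[]  = {0,5,7};   // positions into the aij value array
     PetscScalar vals[3];            // host (or cudaMalloc'ed) destination
     PetscCall(MatSeqAIJCopySubArray_SeqAIJCUSPARSE(mat,3,idx,vals));

   In practice the routine is installed as a type-specific implementation and reached
   through PETSc's dispatch rather than called directly. */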
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
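  /* isCudaMem() reports whether the caller passed device or host memory for v;
     the final copy direction is chosen accordingly */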
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

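    /* If v is host memory, gather into a temporary device buffer w and copy it back
       afterwards; if v is already device memory, gather into it directly */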
    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

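    /* Fused gather: the permutation iterator reads av[idx[k]] and VecCUDAEquals
       assigns it to dv[k], all within a single thrust::for_each over k = 0..n-1 */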
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar))); /* the values were staged on the device and copied back to host memory */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
