xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 261a78b473f90dfbc8ba911c87f1bd46aa32a06a)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/adjacent_difference.h>
16 #include <thrust/async/for_each.h>
17 #include <thrust/iterator/constant_iterator.h>
18 #include <thrust/remove.h>
19 #include <thrust/sort.h>
20 #include <thrust/unique.h>
21 
/* Option strings for MatCUSPARSEStorageFormat, in enum-value order; the last three
   entries (enum type name, option prefix, NULL) are the sentinel triple required by
   PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26 
27   typedef enum {
28       CUSPARSE_MV_ALG_DEFAULT = 0,
29       CUSPARSE_COOMV_ALG      = 1,
30       CUSPARSE_CSRMV_ALG1     = 2,
31       CUSPARSE_CSRMV_ALG2     = 3
32   } cusparseSpMVAlg_t;
33 
34   typedef enum {
35       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41       CUSPARSE_SPMM_COO_ALG1    = 1,
42       CUSPARSE_SPMM_COO_ALG2    = 2,
43       CUSPARSE_SPMM_COO_ALG3    = 3,
44       CUSPARSE_SPMM_COO_ALG4    = 5,
45       CUSPARSE_SPMM_CSR_ALG1    = 4,
46       CUSPARSE_SPMM_CSR_ALG2    = 6,
47   } cusparseSpMMAlg_t;
48 
49   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
52   } cusparseCsr2CscAlg_t;
53   */
  /* String tables for the cuSPARSE algorithm enums, listed in 0-based enum-value
     order (see the comment above); trailing entries are the enum type name, the
     option prefix, and the NULL terminator required by PetscOptionsEnum() */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57 #endif
58 
59 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
64 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
65 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66 
67 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
68 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
69 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
71 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
73 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
74 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
77 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
81 
82 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
87 
88 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE.

   A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both set the same field; any other operation is rejected.
   (The previous version duplicated the identical case bodies; they are merged
   via fall-through here.) */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall-through: both operations map to the one format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
112 
113 /*@
114    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
115    operation. Only the MatMult operation can use different GPU storage formats
116    for MPIAIJCUSPARSE matrices.
117    Not Collective
118 
119    Input Parameters:
120 +  A - Matrix of type SEQAIJCUSPARSE
121 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
122 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
123 
124    Output Parameter:
125 
126    Level: intermediate
127 
128 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
129 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; PetscTryMethod() is a no-op when
     the matrix type has not composed "MatCUSPARSESetFormat_C" */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
137 
/* SEQAIJCUSPARSE implementation backing MatCUSPARSESetUseCPUSolve(): records
   whether MatSolve should run on the CPU instead of the GPU */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
146 
147 /*@
148    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 -  use_cpu - set flag for using the built-in CPU MatSolve
153 
154    Output Parameter:
155 
156    Notes:
157    The cuSparse LU solver currently computes the factors with the built-in CPU method
158    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   Use this method to specify if the solve is done on the CPU or GPU (GPU is the default).
160 
161    Level: intermediate
162 
163 .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
164 @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; silently ignored for matrix
     types that have not composed "MatCUSPARSESetUseCPUSolve_C" */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
172 
/* MatSetOption() for SEQAIJCUSPARSE: handles MAT_FORM_EXPLICIT_TRANSPOSE itself
   (the cached GPU transpose must be dropped when the option is turned off) and
   forwards every other option to the host AIJ implementation */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* discard any cached transpose so that re-enabling the option later cannot
       pick up a stale matrix */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
188 
189 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
190 
/* Numeric LU factorization for SEQAIJCUSPARSE.

   The factorization itself is performed on the host by the AIJ implementation;
   afterwards the appropriate MatSolve variants are installed and, unless the user
   requested CPU solves, the triangular factors are analyzed and copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscBool           natural_rows,natural_cols;

  PetscFunctionBegin;
  /* the host factorization needs current values, so pull A down from the GPU first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* choose the MatSolve flavor: identity permutations allow the faster
     natural-ordering kernels */
  PetscCall(ISIdentity(b->row,&natural_rows));
  PetscCall(ISIdentity(b->col,&natural_cols));
  if (!cusp->use_cpu_solve) {
    if (natural_rows && natural_cols) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* stage the triangular factors on the GPU for the device solve path */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
224 
/* Processes the -mat_cusparse_* runtime options for an unfactored SEQAIJCUSPARSE
   matrix: GPU storage format (for SpMV only, or for all operations), CPU vs GPU
   triangular solve, and — for CUDA >= 11 — the cuSPARSE SpMV/SpMM/csr2csc
   algorithm selections */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) { /* factored matrices expose no tunables here */
    /* storage format used for MatMult only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));

    /* storage format used for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg));
    /* same positional-consistency check as for SpMV above */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
264 
/* Symbolic ILU factorization: reset stale GPU factor data, run the host AIJ
   symbolic phase, then route the numeric phase through the CUSPARSE version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
275 
/* Symbolic LU factorization: reset stale GPU factor data, run the host AIJ
   symbolic phase, then route the numeric phase through the CUSPARSE version */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
286 
/* Symbolic ICC factorization: reset stale GPU factor data, run the host AIJ
   symbolic phase, then route the numeric phase through the CUSPARSE version */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
297 
/* Symbolic Cholesky factorization: reset stale GPU factor data, run the host AIJ
   symbolic phase, then route the numeric phase through the CUSPARSE version */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
308 
/* Builds (first call) or refreshes (later calls) the GPU copy of the unit lower
   triangular factor L taken from the AIJ factor arrays of A.

   First call: assembles L in pinned host memory — the strictly-lower part of each
   row followed by an explicit unit diagonal entry — uploads it into a CsrMatrix,
   creates the cusparse matrix descriptor, and runs the triangular-solve analysis.
   Later calls: only the numerical values are recopied and re-uploaded.
   Only runs when the host copy is newer than the device copy. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) { /* first time: build the full CSR structure */
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust uploads below are fast */
        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix; row 0 holds only the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: assigning the thrust arrays uploads the host data to the device */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cusparse needs an explicit user-provided work buffer for csrsv */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer; AA_h is kept so later value-only updates can reuse it */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix (same layout as above, values only) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
452 
/* Builds (first call) or refreshes (later calls) the GPU copy of the upper
   triangular factor U taken from the AIJ factor arrays of A.

   The AIJ factor stores U rows backwards via a->diag, with the diagonal held as
   its reciprocal; assembly therefore walks rows from n-1 down to 0, placing the
   inverted diagonal (1/v[nz]) first in each row. First call builds the full CSR
   structure in pinned host memory, uploads it, and runs the cusparse
   triangular-solve analysis; later calls refresh the values only. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) { /* first time: build the full CSR structure */
        PetscScalar *AAUp;

        /* pinned host buffers so the thrust uploads below are fast */
        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (stored inverted in the AIJ factor) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: assigning the thrust arrays uploads the host data to the device */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cusparse needs an explicit user-provided work buffer for csrsv */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer; AA_h is kept so later value-only updates can reuse it */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix (same layout as above, values only) */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
595 
/* Copies both triangular factors of A to the GPU (building them and running the
   cusparse solve analysis on first use), allocates the device work vector shared
   by the triangular solves, and caches the row/column permutations as device
   index arrays when they are not the identity */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* device scratch vector used between the lower and upper solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation once, only if non-trivial */
  PetscCall(ISIdentity(isrow,&row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow,&r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    PetscCall(ISRestoreIndices(isrow,&r));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }

  /* upper triangular indices: likewise for the column permutation */
  PetscCall(ISIdentity(iscol,&col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol,&c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    PetscCall(ISRestoreIndices(iscol,&c));
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
638 
/*
  Builds the device-side triangular factors for ICC from the host factor data.
  The upper factor is stored in CSR; the lower solve reuses the same (upper)
  structure with CUSPARSE_OPERATION_TRANSPOSE, so only values differ between
  the two factors. On a values-only update (factors already allocated) just
  the value arrays are recomputed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;  /* pinned host CSR row offsets / column indices of the upper factor */
  PetscScalar                       *AAUp;         /* pinned host values for the upper-factor solve */
  PetscScalar                       *AALo;         /* pinned host values for the (transposed) lower-factor solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;  /* NOTE(review): A->data is cast to both Mat_SeqAIJ and Mat_SeqSBAIJ; relies on i/j/a layout compatibility -- confirm */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the host copy is newer than (or there is no) device copy */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; v[nz] (the last stored entry of
             the row) is taken as the diagonal, and both factors store its inverse */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* strict upper part: negated for the upper solve, additionally
               scaled by the inverse diagonal for the transposed lower solve */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 (CUDA >= 9) needs an explicit analysis work buffer */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
                                         #else
                                         upTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        /* the lower factor keeps FILL_MODE_UPPER on purpose: it shares the
           upper-triangular storage and is solved with OPERATION_TRANSPOSE below */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
                                         #else
                                         loTriFactor->solveInfo));
                                         #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* structures already exist: recompute values only and re-upload.
           Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
855 
/*
  Ensures the ICC triangular factors are present on the GPU and, when the
  ordering is not the identity, caches the permutation and its inverse as
  device index arrays for use by the triangular solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscInt                     nrows    = A->rmap->n;
  PetscBool                    identity;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* both strict triangles are solved with, the diagonal is counted once */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache permutation (rows) and inverse permutation (columns) on the device */
  PetscCall(ISIdentity(perm,&identity));
  if (!identity) {
    IS             invperm;
    const PetscInt *inv,*fwd;

    PetscCall(ISInvertPermutation(perm,PETSC_DECIDE,&invperm));
    PetscCall(ISGetIndices(invperm,&inv));
    PetscCall(ISGetIndices(perm,&fwd));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(fwd,fwd+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inv,inv+nrows);
    PetscCall(ISRestoreIndices(invperm,&inv));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(perm,&fwd));
    PetscCall(PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
892 
/*
  Numeric Cholesky factorization for SEQAIJCUSPARSE: pull the matrix back to
  the host, run the CPU factorization, select the solve kernels according to
  whether the ordering is natural, then push the factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *b  = (Mat_SeqAIJ*)B->data;
  IS         ip  = b->row;
  PetscBool  natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural-ordering
     variants skip the device-side permutation step */
  PetscCall(ISIdentity(ip,&natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
921 
/*
  Builds the transposes (CSC form) of the cached lower and upper triangular
  factors and runs the cusparse triangular-solve analysis on them, so that
  transpose solves can be performed as non-transpose solves on the transposed
  data. Results are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose
  of the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Fix: the two MAT_CUSPARSEGenerateTranspose events were "ended" with a second
  PetscLogEventBegin(), leaving the events unbalanced and corrupting -log_view
  output; both are now PetscLogEventEnd().
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; transposition
     flips the fill mode, all other descriptor fields are inherited */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                               loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                               loTriFactor->csrMat->values->data().get(),
                                               loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(),
                                               loTriFactorT->csrMat->values->data().get(),
                                               loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                  loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                  loTriFactor->csrMat->values->data().get(),
                                  loTriFactor->csrMat->row_offsets->data().get(),
                                  loTriFactor->csrMat->column_indices->data().get(),
                                  loTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
                                  #else
                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was erroneously a second EventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                         loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                         loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                         loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                         &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                   loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                   loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                   loTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   loTriFactorT->solveInfo,
                                   loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
                                   #else
                                   loTriFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                               upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                               upTriFactor->csrMat->values->data().get(),
                                               upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(),
                                               upTriFactorT->csrMat->values->data().get(),
                                               upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                                  upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                  upTriFactor->csrMat->values->data().get(),
                                  upTriFactor->csrMat->row_offsets->data().get(),
                                  upTriFactor->csrMat->column_indices->data().get(),
                                  upTriFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
                                  #else
                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC, indexBase));
                                 #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); /* was erroneously a second EventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                         upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                         upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                         upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                         &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                   upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                   upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                   upTriFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   upTriFactorT->solveInfo,
                                   upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
                                   #else
                                   upTriFactorT->solveInfo));
                                   #endif

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1125 
/* Thrust unary functor converting a PetscScalar to a PetscInt by taking the
   real part and truncating via the C cast; presumably used to transfer index
   data held in scalar arrays -- usage is not visible in this chunk. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return (PetscInt)PetscRealPart(s);
  }
};
1134 
1135 static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1136 {
1137   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1138   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1139   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1140   cusparseStatus_t             stat;
1141   cusparseIndexBase_t          indexBase;
1142 
1143   PetscFunctionBegin;
1144   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1145   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1146   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1147   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1148   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
1149   if (A->transupdated) PetscFunctionReturn(0);
1150   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1151   PetscCall(PetscLogGpuTimeBegin());
1152   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1153     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1154   }
1155   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1156     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
1157     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1158     indexBase = cusparseGetMatIndexBase(matstruct->descr);
1159     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
1160     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1161 
1162     /* set alpha and beta */
1163     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
1164     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
1165     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
1166     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1167     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
1168     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1169 
1170     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1171       CsrMatrix *matrixT = new CsrMatrix;
1172       matstructT->mat = matrixT;
1173       matrixT->num_rows = A->cmap->n;
1174       matrixT->num_cols = A->rmap->n;
1175       matrixT->num_entries = a->nz;
1176       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1177       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1178       matrixT->values = new THRUSTARRAY(a->nz);
1179 
1180       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
1181       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1182 
1183      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1184       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1185         stat = cusparseCreateCsr(&matstructT->matDescr,
1186                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1187                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1188                                matrixT->values->data().get(),
1189                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1190                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1191       #else
1192         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
1193            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
1194 
1195            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
1196            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
1197            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
1198         */
1199         if (matrixT->num_entries) {
1200           stat = cusparseCreateCsr(&matstructT->matDescr,
1201                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1202                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1203                                  matrixT->values->data().get(),
1204                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
1205                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
1206 
1207         } else {
1208           matstructT->matDescr = NULL;
1209           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1210         }
1211       #endif
1212      #endif
1213     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1214    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1215       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1216    #else
1217       CsrMatrix *temp  = new CsrMatrix;
1218       CsrMatrix *tempT = new CsrMatrix;
1219       /* First convert HYB to CSR */
1220       temp->num_rows = A->rmap->n;
1221       temp->num_cols = A->cmap->n;
1222       temp->num_entries = a->nz;
1223       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1224       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1225       temp->values = new THRUSTARRAY(a->nz);
1226 
1227       stat = cusparse_hyb2csr(cusparsestruct->handle,
1228                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1229                               temp->values->data().get(),
1230                               temp->row_offsets->data().get(),
1231                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1232 
1233       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1234       tempT->num_rows = A->rmap->n;
1235       tempT->num_cols = A->cmap->n;
1236       tempT->num_entries = a->nz;
1237       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1238       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1239       tempT->values = new THRUSTARRAY(a->nz);
1240 
1241       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1242                               temp->num_cols, temp->num_entries,
1243                               temp->values->data().get(),
1244                               temp->row_offsets->data().get(),
1245                               temp->column_indices->data().get(),
1246                               tempT->values->data().get(),
1247                               tempT->column_indices->data().get(),
1248                               tempT->row_offsets->data().get(),
1249                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1250 
1251       /* Last, convert CSC to HYB */
1252       cusparseHybMat_t hybMat;
1253       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1254       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1255         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1256       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1257                               matstructT->descr, tempT->values->data().get(),
1258                               tempT->row_offsets->data().get(),
1259                               tempT->column_indices->data().get(),
1260                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1261 
1262       /* assign the pointer */
1263       matstructT->mat = hybMat;
1264       A->transupdated = PETSC_TRUE;
1265       /* delete temporaries */
1266       if (tempT) {
1267         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1268         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1269         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1270         delete (CsrMatrix*) tempT;
1271       }
1272       if (temp) {
1273         if (temp->values) delete (THRUSTARRAY*) temp->values;
1274         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1275         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1276         delete (CsrMatrix*) temp;
1277       }
1278      #endif
1279     }
1280   }
1281   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1282     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1283     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1284     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1285     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1286     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1287     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1288     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1289     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1290     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1291     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1292     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1293       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1294       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1295       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1296     }
1297     if (!cusparsestruct->csr2csc_i) {
1298       THRUSTARRAY csr2csc_a(matrix->num_entries);
1299       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1300 
1301       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1302      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1303       void   *csr2cscBuffer;
1304       size_t csr2cscBufferSize;
1305       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1306                                            A->cmap->n, matrix->num_entries,
1307                                            matrix->values->data().get(),
1308                                            cusparsestruct->rowoffsets_gpu->data().get(),
1309                                            matrix->column_indices->data().get(),
1310                                            matrixT->values->data().get(),
1311                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1312                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1313                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
1314       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1315      #endif
1316 
1317       if (matrix->num_entries) {
1318         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1319            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1320            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1321 
1322            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1323            should be filled with indexBase. So I just take a shortcut here.
1324         */
1325         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1326                               A->cmap->n,matrix->num_entries,
1327                               csr2csc_a.data().get(),
1328                               cusparsestruct->rowoffsets_gpu->data().get(),
1329                               matrix->column_indices->data().get(),
1330                               matrixT->values->data().get(),
1331                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1332                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1333                               CUSPARSE_ACTION_NUMERIC,indexBase,
1334                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1335                              #else
1336                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1337                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1338                              #endif
1339       } else {
1340         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1341       }
1342 
1343       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1344       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1345      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1346       PetscCallCUDA(cudaFree(csr2cscBuffer));
1347      #endif
1348     }
1349     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1350                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1351                                                      matrixT->values->begin()));
1352   }
1353   PetscCall(PetscLogGpuTimeEnd());
1354   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1355   /* the compressed row indices is not used for matTranspose */
1356   matstructT->cprowIndices = NULL;
1357   /* assign the pointer */
1358   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1359   A->transupdated = PETSC_TRUE;
1360   PetscFunctionReturn(0);
1361 }
1362 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solve A^T x = b on the GPU for an LU/ILU-factored matrix with a non-natural ordering:
   b is first reordered with the row permutation, then the transposed upper factor and
   transposed lower factor are solved in turn (factor order reverses under transposition),
   and finally the result is reordered with the column permutation.  The transposed-factor
   analysis data is created lazily on first use and cached in cusparseTriFactors.

   Input:  A  - the factored matrix (triangular factors in A->spptr), bb - right-hand side
   Output: xx - the solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (both transposed factors are built together by the analysis routine) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation; x is used as scratch for the permuted rhs */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: reads xarray, writes tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: reads tempGPU, writes xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary back into the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1450 
/*
   Natural-ordering variant of MatSolveTranspose_SeqAIJCUSPARSE: no row/column
   permutations are applied, so b feeds the transposed upper-factor solve directly
   and the transposed lower-factor solve writes straight into x.  The transposed
   factors are generated lazily on first use and cached in cusparseTriFactors.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (both transposed factors are built together) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the transposed upper factor: reads barray, writes tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor: reads tempGPU, writes xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1519 
/*
   Solve A x = b on the GPU using the cached triangular factors of an LU/ILU
   factorization performed with a non-natural ordering: b is reordered with the
   row permutation into the work vector, the lower then upper factor systems are
   solved, and the result is reordered with the column permutation into x.

   Input:  A  - the factored matrix (triangular factors in A->spptr), bb - right-hand side
   Output: xx - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: reads tempGPU, writes xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                         xarray);PetscCallCUSPARSE(stat);
                      #endif

  /* Then, solve U: reads xarray, writes tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Last, reorder the work vector with the column permutation into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1594 
/*
   Natural-ordering variant of MatSolve_SeqAIJCUSPARSE: no row/column permutations
   are applied, so b feeds the lower-factor solve directly (via the work vector)
   and the upper-factor solve writes straight into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: reads barray, writes tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        tempGPU->data().get());PetscCallCUSPARSE(stat);
                      #endif

  /* Next, solve U: reads tempGPU, writes xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
                      #else
                        xarray);PetscCallCUSPARSE(stat);
                      #endif

  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1655 
/* Report the MatSolverType of factor matrices produced by this package ("cusparse"); A is unused */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1662 
1663 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
1670 
1671   Level: beginner
1672 
1673 .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1674 M*/
1675 
/*
   Create an (unfactored) factor matrix B for A of type MATSEQAIJCUSPARSE, wiring up the
   CUSPARSE symbolic-factorization routines for the requested factor type -- or the plain
   SeqAIJ routines when A is bound to the CPU -- and recording the preferred ordering for
   each supported factorization.  Supported types: LU/ILU/ILUDT and Cholesky/ICC.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;   /* B is square, sized by A's local row count */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* propagate CPU binding so the factor is computed where A lives */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* preallocation is deferred; the symbolic factorization allocates the structure */
  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
1716 
/* Synchronize the host CSR values with the device copy.

   If the matrix values currently live only on the GPU, copy them back into the
   host value array and mark the matrix valid on both sides; otherwise this is
   a no-op.  Only the numerical values move — the sparsity pattern is assumed
   unchanged since the last upload. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    PetscCallCUDA(cudaMemcpy(aij->a,csr->values->data().get(),aij->nz*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;   /* host and device now agree */
  }
  PetscFunctionReturn(0);
}
1735 
/* MatSeqAIJGetArray() implementation: hand back the host value array after
   making sure any newer GPU values have been copied down first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1743 
/* MatSeqAIJRestoreArray() implementation: the caller may have modified the
   host values, so the CPU copy becomes the authoritative one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;   /* device values are now stale */
  PetscFunctionReturn(0);
}
1751 
/* MatSeqAIJGetArrayRead() implementation: read-only access to the host values;
   pulls the values down from the GPU if they are newer there. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1759 
/* MatSeqAIJRestoreArrayRead() implementation: read-only access changed nothing,
   so no offload-mask update is needed — just drop the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
1766 
/* MatSeqAIJGetArrayWrite() implementation: write-only access, so the current
   GPU values are not copied down — the caller is expected to overwrite them. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
1773 
/* MatSeqAIJRestoreArrayWrite() implementation: the caller wrote new host
   values, so the CPU copy becomes the authoritative one. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;   /* device values are now stale */
  PetscFunctionReturn(0);
}
1781 
/* MatSeqAIJGetCSRAndMemType() implementation: expose the device CSR arrays
   (row offsets, column indices, values) and report that they live in CUDA memory.

   Each of i, j, a, mtype may be NULL if the caller does not need it.

   Fix: the factor-type guard must run BEFORE MatSeqAIJCUSPARSECopyToGPU(); for
   a factored matrix A->spptr does not hold a Mat_SeqAIJCUSPARSE, so uploading
   first could dereference the wrong structure before the check ever fired.
   Also corrected the grammar of the 64-bit-indices error message. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    /* device arrays are 32-bit (THRUSTINTARRAY32); cannot alias them as 64-bit PetscInt */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not support 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
1812 
/* Upload the matrix from host to device, rebuilding the device structures
   only when necessary.

   Two paths:
   - Pattern unchanged (same nonzerostate) and CSR format: only the numerical
     values are re-assigned on the device and the cached transpose values are
     invalidated (pattern kept).
   - Otherwise: the old mult struct, work vector and row-offset cache are
     destroyed and a fresh CSR (or, pre CUDA-11, ELL/HYB) representation is
     built from the host arrays.

   Compressed-row storage (a->compressedrow) is honored: only the nonempty
   rows are uploaded, together with their original row indices (ridx). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* becomes FALSE if host values are absent (pattern-only upload) */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* values changed but pattern did not: invalidate transpose values only (PETSC_FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* pattern changed (or non-CSR format): tear down all cached device state before rebuilding */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* only the nonempty rows are stored; ridx maps them back to full row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* no host values yet: upload the pattern only and leave offloadmask as-is */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, required by CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* pre CUDA-11 path: build a temporary CSR, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* tmp counts the ridx entries uploaded above (0 when not compressed) */
        PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1967 
/* Thrust functor for zipped iterators: accumulate the second tuple slot with
   the first, i.e. dst += src for a tuple (src, dst). */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1977 
/* Thrust functor for zipped iterators: copy the first tuple slot into the
   second, i.e. dst = src for a tuple (src, dst). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    using thrust::get;
    get<1>(t) = get<0>(t);
  }
};
1987 
/* Thrust functor for zipped iterators: copy the second tuple slot into the
   first, i.e. dst = src for a tuple (dst, src) — the slot order opposite to
   VecCUDAEquals. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    using thrust::get;
    get<0>(t) = get<1>(t);
  }
};
1997 
/* Per-product scratch data stored in C->product->data for cuSPARSE
   sparse-dense (SpMM) and sparse-sparse (SpGEMM) matrix products. */
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was originally MATSEQDENSE (CPU): convert back after the numeric phase */
  PetscScalar           *Bt;       /* device buffer for B^T (pre CUDA-11 only, since csrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense product for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;     /* expanded device CSR of B when B uses compressed row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* descriptor for Bcsr above */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;  /* dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;  /* SpGEMMreuse work buffers */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* current size of mmBuffer */
  void                  *mmBuffer;    /* SpMM/SpGEMM compute buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2022 
/* Destructor for MatMatCusparse product data: release all device buffers and
   cuSPARSE descriptors, then the intermediate dense matrix X and the struct
   itself.  cudaFree(NULL) is a no-op, so Bt and the mm buffers may be freed
   unconditionally; descriptors are destroyed only if they were created. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mmdata->dBuffer4)  PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5)  PetscCallCUDA(cudaFree(mmdata->dBuffer5));
 #endif
  if (mmdata->mmBuffer)  PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2046 
2047 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2048 
/* Numeric phase of C = op(A) * op(B) with A sparse (SEQAIJCUSPARSE) and B
   dense, computed on the GPU.

   - AB/PtAP use A as-is; AtB uses either an implicit transpose (opA) or the
     explicitly formed device transpose; ABt/RARt transpose B instead.
   - PtAP/RARt first compute the intermediate X = A*P (resp. A*R^T) and then
     finish with a dense-dense product.
   - CUDA >= 11 path uses cusparseSpMM with cached dense descriptors and a
     work buffer, re-created whenever the leading dimensions of B or C change;
     the pre-11 path uses csrmm, explicitly transposing B via cublasXgeam
     when needed.
   - If B (or C) arrived as a CPU dense matrix it is converted to
     MATSEQDENSECUDA in place for the computation and converted back at
     the end. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the A-side operand/operation and the dimensions of op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored device transpose and a non-transpose op */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  PetscCall(MatDenseCUDAGetArrayRead(B,&barray));

  PetscCall(MatDenseGetLDA(B,&blda));
  /* PtAP/RARt write the sparse-dense result into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    PetscCall(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
    PetscCall(MatDenseGetLDA(C,&clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds the cached one */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly form B^T into the preallocated mmdata->Bt via a geam copy-transpose */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);PetscCallCUSPARSE(stat);
 #endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
  /* finish PtAP/RARt with the dense product of B (P or R) and the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* undo the in-place CPU->GPU conversions done for the computation */
  if (mmdata->cisdense) {
    PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2219 
/* Symbolic phase of C = op(A) * op(B) with A sparse (SEQAIJCUSPARSE) and B
   dense: set the sizes/type of C, allocate the per-product MatMatCusparse
   data (Bt buffer pre CUDA-11, intermediate X for PtAP/RARt), and install
   the numeric-phase callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  PetscCall(MatSetType(C,MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2292 
2293 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2294 {
2295   Mat_Product                  *product = C->product;
2296   Mat                          A,B;
2297   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2298   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2299   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2300   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2301   PetscBool                    flg;
2302   cusparseStatus_t             stat;
2303   MatProductType               ptype;
2304   MatMatCusparse               *mmdata;
2305 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2306   cusparseSpMatDescr_t         BmatSpDescr;
2307 #endif
2308   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2309 
2310   PetscFunctionBegin;
2311   MatCheckProduct(C,1);
2312   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2313   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
2314   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2315   mmdata = (MatMatCusparse*)C->product->data;
2316   A = product->A;
2317   B = product->B;
2318   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2319     mmdata->reusesym = PETSC_FALSE;
2320     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2321     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2322     Cmat = Ccusp->mat;
2323     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2324     Ccsr = (CsrMatrix*)Cmat->mat;
2325     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2326     goto finalize;
2327   }
2328   if (!c->nz) goto finalize;
2329   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2330   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2331   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2332   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2333   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2334   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2335   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2336   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2337   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2338   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2339   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2340   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2341   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2342   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2343 
2344   ptype = product->type;
2345   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2346     ptype = MATPRODUCT_AB;
2347     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2348   }
2349   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2350     ptype = MATPRODUCT_AB;
2351     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2352   }
2353   switch (ptype) {
2354   case MATPRODUCT_AB:
2355     Amat = Acusp->mat;
2356     Bmat = Bcusp->mat;
2357     break;
2358   case MATPRODUCT_AtB:
2359     Amat = Acusp->matTranspose;
2360     Bmat = Bcusp->mat;
2361     break;
2362   case MATPRODUCT_ABt:
2363     Amat = Acusp->mat;
2364     Bmat = Bcusp->matTranspose;
2365     break;
2366   default:
2367     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2368   }
2369   Cmat = Ccusp->mat;
2370   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2371   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2372   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2373   Acsr = (CsrMatrix*)Amat->mat;
2374   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2375   Ccsr = (CsrMatrix*)Cmat->mat;
2376   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2377   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2378   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2379   PetscCall(PetscLogGpuTimeBegin());
2380 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2381   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2382   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2383   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2384     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2385                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2386                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2387                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2388   #else
2389     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2390                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2391                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2392                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2393     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2394                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2395                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2396   #endif
2397 #else
2398   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2399                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2400                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2401                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2402                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2403 #endif
2404   PetscCall(PetscLogGpuFlops(mmdata->flops));
2405   PetscCallCUDA(WaitForCUDA());
2406   PetscCall(PetscLogGpuTimeEnd());
2407   C->offloadmask = PETSC_OFFLOAD_GPU;
2408 finalize:
2409   /* shorter version of MatAssemblyEnd_SeqAIJ */
2410   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
2411   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
2412   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2413   c->reallocs         = 0;
2414   C->info.mallocs    += 0;
2415   C->info.nz_unneeded = 0;
2416   C->assembled = C->was_assembled = PETSC_TRUE;
2417   C->num_ass++;
2418   PetscFunctionReturn(0);
2419 }
2420 
/*
   Symbolic phase of C = A*B, A^t*B or A*B^t for two MATSEQAIJCUSPARSE matrices.

   Builds the sparsity pattern of C on the GPU with cuSPARSE SpGEMM
   (cusparseSpGEMMreuse for CUDA >= 11.4, cusparseSpGEMM for other CUDA 11.x,
   cusparseXcsrgemmNnz/csrgemm for CUDA < 11), allocates C's CSR arrays on both
   device and host, and stashes in the MatMatCusparse product data everything the
   numeric phase (MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE) needs to reuse.
   Transposed operands are handled by forming explicit transposes, since opA/opB
   below are fixed to NON_TRANSPOSE.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  cusparseStatus_t             stat;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: freed by MatDestroy_MatMatCusparse when the product is destroyed */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* For symmetric operands, A^t*B (resp. A*B^t) equals A*B; record that we used
     this fact so the numeric phase can verify the same shortcut applies */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* select operand mult structs (explicit transposes replace unsupported opA/opB),
     result sizes m x n (with inner dimension k), and compressed-row handling */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C,m,n,m,n));
  PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    /* C inherits A's nonzero-row pattern: copy row indices to host and device */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalars so spgemm can run with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: C has no nonzeros, skip the cuSPARSE calls entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets;
       column indices and values are shared with B's compressed storage */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count (host-side, from the CSR row pointers of A and B)
     so the numeric phase can log them without re-traversing the pattern */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* C's descriptor starts with nnz = 0 and NULL arrays; pointers are attached
     below with cusparseCsrSetPointers once the nonzero count is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* two-phase nnz computation: first query the three buffer sizes, then run */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
  PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
  /* dBuffer1/2 are no longer needed; dBuffer3 is still needed by _copy below */
  PetscCallCUDA(cudaFree(dBuffer1));
  PetscCallCUDA(cudaFree(dBuffer2));

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy the symbolic pattern into C; dBuffer5 is kept for the numeric phase */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaFree(dBuffer3));
  /* numeric values are computed here too, so a reuse of the symbolic phase can
     skip straight to assembly (see reusesym below) */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt) C_nnz1;
  PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the GPU pattern into the host Mat_SeqAIJ structure (i, j, ilen, imax, ...) */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  PetscCall(PetscMalloc1(m+1,&c->i));
  PetscCall(PetscMalloc1(c->nz,&c->j));
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* compressedrow.i only covers the nonzero rows; rebuild full c->i of length m+1 */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m,&c->ilen));
  PetscCall(PetscMalloc1(m,&c->imax));
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths, nonzero-row count, and maximum row length from c->i */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz,&c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2817 
2818 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2819 
/*
   Selects the symbolic product implementation for a MATSEQAIJCUSPARSE A,
   handling sparse (MATSEQAIJCUSPARSE) or dense (MATSEQDENSE) B. Command line
   options (e.g. -matmatmult_backend_cpu) let the user force the CPU backend;
   matrices bound to the CPU also fall back to the SeqAIJ implementations.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  /* the GPU path is only eligible when no operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* each product type exposes two spellings of the same option: the legacy
       per-API name (api_user) and the generic -mat_product_algorithm_backend_cpu */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
     if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* no native cuSPARSE triple products; compose from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}
2941 
/* yy = A*xx. Thin dispatcher into the shared SpMV kernel: no add (yy slot NULL), no transpose, no Hermitian */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2948 
/* zz = A*xx + yy. Thin dispatcher into the shared SpMV kernel: add yy, no transpose, no Hermitian */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2955 
/* yy = A^H * xx. Thin dispatcher into the shared SpMV kernel: no add, trans=TRUE, herm=TRUE */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2962 
/* zz = A^H * xx + yy. Thin dispatcher into the shared SpMV kernel: add yy, trans=TRUE, herm=TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
  PetscFunctionReturn(0);
}
2969 
/* yy = A^T * xx. Thin dispatcher into the shared SpMV kernel: no add, trans=TRUE, herm=FALSE */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
2976 
/* Scatter-add kernel used for the compressed-row case: y[idx[i]] += x[i] for i in [0,n).
   One thread per entry; callers launch (n+255)/256 blocks of 256 threads, and the
   i < n guard handles the ragged tail of the last block. The update is non-atomic,
   so the idx entries must be pairwise distinct (they are compressed row indices). */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* widen before multiplying: blockIdx.x*blockDim.x is a 32-bit unsigned product and
     would wrap for very large grids, and n is PetscInt (possibly 64-bit) */
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2982 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   This is the single kernel behind all MatMult/MatMultAdd/(Hermitian)Transpose variants above.
   yy may be NULL (plain multiply) or aliased with zz (in-place add); both cases are handled. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* dense-vector lengths for the generic SpMV API; only set/used on the CSR paths below */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  /* empty matrix: op(A) x is zero, so the result is just y (or zero when there is no y) */
  if (!a->nz) {
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* transpose product: either let cuSPARSE apply op on the untransposed matrix, or
       (when allowed and not Hermitian) multiply with an explicitly formed transpose */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache of cuSPARSE generic-API descriptors/buffers */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      /* pre-CUDA-11: legacy csrmv API, no descriptor/buffer management needed */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* transpose case: the SpMV wrote the full-length z already; only the z != y add remains */
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    /* translate C-string exceptions (e.g. from the thrust calls above) into a PETSc error */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* one multiply + one add per stored nonzero; without y, one add per nonzero row is saved */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
3171 
/* zz = A^T * xx + yy. Thin dispatcher into the shared SpMV kernel: add yy, trans=TRUE, herm=FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
  PetscFunctionReturn(0);
}
3178 
/* Finish assembly through the host SeqAIJ path; if that changed the nonzero pattern,
   the cached raw device matrix is stale and must be released. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp        = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   statebefore  = A->nonzerostate; /* snapshot before assembly */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  if (cusp->deviceMat && A->nonzerostate != statebefore) {
    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3194 
3195 /* --------------------------------------------------------------------------------*/
3196 /*@
3197    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3199    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3200    assembly performance the user should preallocate the matrix storage by setting
3201    the parameter nz (or the array nnz).  By setting these parameters accurately,
3202    performance during matrix assembly can be increased by more than a factor of 50.
3203 
3204    Collective
3205 
3206    Input Parameters:
3207 +  comm - MPI communicator, set to PETSC_COMM_SELF
3208 .  m - number of rows
3209 .  n - number of columns
3210 .  nz - number of nonzeros per row (same for all rows)
3211 -  nnz - array containing the number of nonzeros in the various rows
3212          (possibly different for each row) or NULL
3213 
3214    Output Parameter:
3215 .  A - the matrix
3216 
3217    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3218    MatXXXXSetPreallocation() paradgm instead of this routine directly.
3219    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3220 
3221    Notes:
3222    If nnz is given then nz is ignored
3223 
3224    The AIJ format (also called the Yale sparse matrix format or
3225    compressed row storage), is fully compatible with standard Fortran 77
3226    storage.  That is, the stored row and column indices can begin at
3227    either one (as in Fortran) or zero.  See the users' manual for details.
3228 
3229    Specify the preallocated storage with either nz or nnz (not both).
3230    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3231    allocation.  For large problems you MUST preallocate memory or you
3232    will get TERRIBLE performance, see the users' manual chapter on matrices.
3233 
3234    By default, this format uses inodes (identical nodes) when possible, to
3235    improve numerical efficiency of matrix-vector products and solves. We
3236    search for consecutive rows with the same nonzero structure, thereby
3237    reusing matrix information to achieve increased efficiency.
3238 
3239    Level: intermediate
3240 
3241 .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
3242 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));             /* sequential matrix: local sizes equal global sizes */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  /* cast away const: the preallocation routine takes a non-const array but does not modify it here */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
3252 
/* Release the CUSPARSE-specific storage (plain or triangular-factor form), detach every
   composed method in the same order as before, then fall through to the SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  for (size_t i = 0; i < sizeof(composed)/sizeof(composed[0]); i++) {
    PetscCall(PetscObjectComposeFunction((PetscObject)A,composed[i],NULL));
  }
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
3274 
3275 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3276 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate through the host SeqAIJ duplicate, then convert the copy in place so the
   new matrix gets the CUSPARSE function table and GPU-side structures set up. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
3284 
/* Y = a*X + Y on the GPU. Three paths: cuSPARSE csrgeam when X's pattern is a subset of Y's,
   a single cuBLAS axpy over the value arrays when the patterns are identical, and a CPU
   fallback (MatAXPY_SeqAIJ) otherwise or when the two matrices use different axpy ops. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mismatched types/bindings: invalidate Y's cached transpose and use the CPU path */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/column structure on device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y in place via csrgeam, writing the result back onto Y's structure */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    /* alpha/beta (&a, &b) are host pointers here, so switch the pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit external work buffer for csrgeam */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                                  &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                       &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                       cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry by entry, so ay += a*ax via cuBLAS */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz,&bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
    PetscCall(PetscLogGpuFlops(2.0*bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* DIFFERENT_NONZERO_PATTERN (or the one-column workaround above): CPU fallback */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
  }
  PetscFunctionReturn(0);
}
3378 
/* Y = a*Y on the GPU: scale the stored nonzero values in place with one cuBLAS scal,
   then invalidate the cached diagonal information. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  cublasHandle_t handle;
  PetscScalar    *values;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz,&n));          /* number of stored nonzeros as a BLAS int */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&values));  /* device pointer to the up-to-date values */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle,n,&a,values,stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
3398 
/* Zero all stored values of A. For an unfactored matrix whose CSR copy (and possibly its
   cached transpose) already lives on the device, zero those with thrust::fill as well,
   so both host and device copies stay valid; otherwise only the host values are zeroed. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij   = (Mat_SeqAIJ*)A->data;
  PetscBool  ongpu  = PETSC_FALSE; /* set when the device values were zeroed too */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3427 
/* Switch A's operation tables between the CPU (SeqAIJ) and GPU (SeqAIJCUSPARSE)
   implementations. flg = PETSC_TRUE binds to CPU (after syncing data back from the
   device); flg = PETSC_FALSE installs the CUSPARSE kernels and composed methods. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices only record the flag; their ops are managed elsewhere */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make sure the host copy is current before the CPU kernels take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the aij-level ops table (getarray etc. installed in the GPU branch below) */
    PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
   }
  A->boundtocpu = flg;
  /* inodes are a CPU-side optimization; only enable them when bound to CPU and present */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3491 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE: (optionally) duplicate/copy per reuse mode,
   set the default vector type to CUDA, create the cuSPARSE handle and per-matrix
   algorithm defaults, install the GPU function tables (via MatBindToCPU with flg=FALSE),
   and compose the type-specific methods. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  /* MAT_INPLACE_MATRIX: *newmat is A itself and is converted in place */
  B = *newmat;

  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: only the handle/stream are needed; factors are built later */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
3551 
/* Registered constructor for MATSEQAIJCUSPARSE: create a plain SeqAIJ matrix,
   then convert it in place to the CUSPARSE type */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
3559 
3560 /*MC
3561    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3562 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3564    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3565    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3566 
3567    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3572 
3573   Level: beginner
3574 
3575 .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3576 M*/
3577 
3578 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3579 
/* Register the cuSPARSE-based direct solvers with PETSc's solver-type registry.
   MATSOLVERCUSPARSEBAND provides a banded LU for plain SeqAIJ; MATSOLVERCUSPARSE
   provides LU/Cholesky/ILU/ICC for SeqAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
3591 
/* Release all COO-assembly state held in the CUSPARSE context: the permutation
   arrays of the 'basic' path and the jmap/perm device arrays of the extended path.

   Fix: the freed device pointers are now also set to NULL. Previously they were
   left dangling while only use_extended_coo was cleared; MatSeqAIJCUSPARSE_Destroy()
   frees jmap_d/perm_d whenever they are non-NULL, so a reset followed by destroy
   would call cudaFree() twice on the same pointers. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->jmap_d = NULL; /* avoid double-free in MatSeqAIJCUSPARSE_Destroy() */
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3609 
/* Free a Mat_SeqAIJCUSPARSE context and everything it owns: the device CSR
   mirrors (plain and transposed), thrust work arrays, COO-assembly state, the
   cuSPARSE handle, and finally the struct itself. Safe to call with *cusparsestruct
   == NULL. Relies on PetscNew() having zeroed unused pointers so the NULL checks
   below are meaningful. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct)); /* PetscFree() also NULLs *cusparsestruct */
  }
  PetscFunctionReturn(0);
}
3628 
/* Free the three thrust device arrays backing a CsrMatrix, then the struct
   itself, and NULL the caller's pointer. A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  CsrMatrix *csr = *mat;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3641 
/* Free one triangular-factor struct: its cuSPARSE matrix descriptor and csrsv
   solve info, the device CSR data, the solve scratch buffer, the pinned host
   copy of the values (AA_h), and (CUDA >= 11) the csr2csc scratch buffer.
   Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host memory */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
   #endif
    PetscCall(PetscFree(*trifactor)); /* PetscFree() also NULLs *trifactor */
  }
  PetscFunctionReturn(0);
}
3658 
/* Free a matvec struct: the device matrix data (CSR, or a hybrid matrix on
   CUDA < 11), the legacy matrix descriptor, the compressed-row index array, the
   device-resident scalar constants, and (CUDA >= 11) the generic SpMV descriptors
   and buffers. Safe to call with *matstruct == NULL.

   Fix: the CsrMatrix_Destroy() call was the only call in this file whose returned
   error code was silently dropped; it is now wrapped in PetscCall() like every
   other PETSc call. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat)); /* previously called without PetscCall(), dropping the error code */
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by SpMV calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i=0; i<3; i++) { /* one cached SpMV setup per operation kind */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3700 
/* Reset a triangular-factors context to its empty state, freeing the four factor
   structs, the permutation index arrays, the work vector, and the band-factor
   device arrays, while keeping the context (and its cuSPARSE handle) alive.

   Fix: a_band_d and i_band_d are now NULLed after cudaFree(). Previously they were
   left dangling, so calling Reset twice (e.g. on re-factorization followed by
   destroy) would free the same device pointers twice. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    /* the destroy helper NULLs each pointer it frees */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
    if ((*trifactors)->i_band_d)   PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
    (*trifactors)->a_band_d = NULL; /* avoid double-free on a second Reset */
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3721 
/* Destroy a triangular-factors context entirely: reset its contents, release the
   cuSPARSE handle it owns, and free the struct. Safe with *trifactors == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    cusparseHandle_t handle = (*trifactors)->handle;
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
3736 
/* Lexicographic less-than on (row, column) tuples, used to sort COO entries
   first by row and then by column within a row */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3747 
/* Equality on (row, column) tuples: true iff both refer to the same matrix entry */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3757 
/* Binary op for adjacent_difference: 0 when the two values are equal, 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3766 
/* Logical OR of two 0/1 flags, returned as a PetscInt (0 or 1) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3775 
3776 #include <thrust/iterator/discard_iterator.h>
3777 /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Set/add values into the device CSR matrix from COO-ordered input v[], using the
   permutation (cooPerm) and, when duplicates existed, the reduction map (cooPerm_a)
   built by MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   v may live on host or device; a NULL v with INSERT_VALUES zeroes the matrix.
   On exit the device copy is authoritative (offloadmask = PETSC_OFFLOAD_GPU). */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;  /* device staging copy when v is on host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done; fall back to a plain assembly */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: INSERT zeroes the matrix, ADD is a no-op; both still "assemble" below */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage host values on the device */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries with INSERT_VALUES: straight permuted copy, values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3858 
/* Mark the cached device transpose of A as out of date; with destroy == PETSC_TRUE
   also free the transpose struct and the cached csr2csc index map. A matrix without
   a CUSPARSE context is left untouched. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3874 
3875 #include <thrust/binary_search.h>
3876 /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A on the device from COO indices (coo_i, coo_j),
   and record the permutation (cooPerm) mapping user COO order to sorted CSR order
   plus, when duplicates exist, the reduction map (cooPerm_a) used later by
   MatSetValuesCOO_SeqAIJCUSPARSE_Basic(). 'Basic' = indices must be non-negative.
   The computed row offsets / column indices are copied back to the host CSR arrays
   of the underlying SeqAIJ, values are zeroed, and the device mirror is created. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;  /* nzr counts nonempty rows */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard previously built permutation state */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);             /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);     /* per-row upper bounds, becomes a->i[1..n] */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays of the SeqAIJ with ones matching the new pattern */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);  /* count rows that have at least one nonzero */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3988 
/* COO preallocation entry point: dispatch between the fast 'basic' device path
   (usable when indices live on the device or contain no negative entries) and the
   general host path, which supports negative indices (ignored entries) and builds
   jmap/perm maps that are then mirrored to the device for MatSetValuesCOO. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* drop any COO state from a previous preallocation */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* only host arrays can be scanned for negative (to-be-ignored) indices;
         device arrays are assumed basic */
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* mirror the host-built jmap (nz+1 offsets) and perm (Atot entries) to the device */
    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4024 
/* Kernel: for each of the nnz stored entries of A, gather the COO input values
   that map onto it (kv[perm[jmap[i]..jmap[i+1])]) and insert/add their sum into
   a[i]. Uses a grid-stride loop so any launch configuration covers all entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount idx = blockIdx.x*blockDim.x + threadIdx.x; idx < nnz; idx += stride) {
    PetscScalar contrib = 0.0;
    for (PetscCount k = jmap[idx]; k < jmap[idx+1]; k++) contrib += kv[perm[k]];
    if (imode == INSERT_VALUES) a[idx] = contrib;
    else                        a[idx] += contrib;
  }
}
4035 
/* Insert/add COO values on the device. The extended path (built by the general
   preallocation) launches MatAddCOOValues with the device jmap/perm maps; the
   basic path delegates to the thrust-based implementation. v may be a host or a
   device array; host input is staged to the device first. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;   /* device-resident view of v */
  PetscScalar         *Aa;
  
  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    /* INSERT overwrites everything, so write-only access suffices and avoids a
       host-to-device copy of stale values */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); /* free the staging copy */
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}
4070 
4071 /*@C
4072     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
4073 
4074    Not collective
4075 
4076     Input Parameters:
4077 +   A - the matrix
4078 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4079 
    Output Parameters:
+   i - the CSR row pointers (device pointer)
-   j - the CSR column indices (device pointer)
4083 
4084     Level: developer
4085 
4086     Notes:
4087       When compressed is true, the CSR structure does not contain empty rows
4088 
4089 .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
4090 @*/
/* Return device pointers to the CSR row offsets (*i) and column indices (*j) of A,
   copying the matrix to the GPU first if needed. When compressed is PETSC_FALSE and
   the matrix uses compressed-row storage, a full-length row-offset array is built
   (and cached) on the device so the result always has A->rmap->n + 1 entries.

   Fix: after the combined `if (!i || !j)` early return both pointers are known to be
   non-NULL, so the former per-pointer `if (i)` / `if (j)` guards were dead code and
   have been removed; behavior is unchanged. */
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!compressed && a->compressedrow.use) { /* need full row offset */
    if (!cusp->rowoffsets_gpu) {
      /* build and cache the uncompressed row offsets on the device from the host a->i */
      cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else *i = csr->row_offsets->data().get();
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4118 
4119 /*@C
4120     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4121 
4122    Not collective
4123 
4124     Input Parameters:
4125 +   A - the matrix
4126 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4127 
    Output Parameters:
+   i - the CSR row pointers (set to NULL)
-   j - the CSR column indices (set to NULL)
4131 
4132     Level: developer
4133 
4134 .seealso: `MatSeqAIJCUSPARSEGetIJ()`
4135 @*/
/* Counterpart of MatSeqAIJCUSPARSEGetIJ(): there is no reference counting to undo,
   so this only invalidates the caller's borrowed pointers. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(0);
}
4145 
4146 /*@C
4147    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4148 
4149    Not Collective
4150 
4151    Input Parameter:
4152 .   A - a MATSEQAIJCUSPARSE matrix
4153 
4154    Output Parameter:
4155 .   a - pointer to the device data
4156 
4157    Level: developer
4158 
4159    Notes: may trigger host-device copies if up-to-date matrix data is on host
4160 
4161 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
4162 @*/
/* Hand out a read-only device pointer to the CSR values of A, copying host data
   to the GPU first if the device copy is stale. The offload mask is untouched
   since the caller promises not to modify the values. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  CsrMatrix *csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
4180 
4181 /*@C
4182    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4183 
4184    Not Collective
4185 
4186    Input Parameter:
4187 .   A - a MATSEQAIJCUSPARSE matrix
4188 
4189    Output Parameter:
4190 .   a - pointer to the device data
4191 
4192    Level: developer
4193 
4194 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
4195 @*/
/* Counterpart of MatSeqAIJCUSPARSEGetArrayRead(): access was read-only, so no
   object-state bump or cache invalidation is needed; just NULL the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}
4205 
4206 /*@C
4207    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4208 
4209    Not Collective
4210 
4211    Input Parameter:
4212 .   A - a MATSEQAIJCUSPARSE matrix
4213 
4214    Output Parameter:
4215 .   a - pointer to the device data
4216 
4217    Level: developer
4218 
4219    Notes: may trigger host-device copies if up-to-date matrix data is on host
4220 
4221 .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
4222 @*/
/* Hand out a read-write device pointer to the CSR values of A (copying host data
   up first if stale). Since the caller may modify the values in place, the device
   copy becomes authoritative and any cached transpose is marked out of date. */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  CsrMatrix *csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4242 /*@C
4243    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4244 
4245    Not Collective
4246 
4247    Input Parameter:
4248 .   A - a MATSEQAIJCUSPARSE matrix
4249 
4250    Output Parameter:
4251 .   a - pointer to the device data
4252 
4253    Level: developer
4254 
4255 .seealso: `MatSeqAIJCUSPARSEGetArray()`
4256 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* Values may have been modified through the pointer: drop the cached
     diagonal and bump the object state so dependent objects recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4268 
4269 /*@C
4270    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4271 
4272    Not Collective
4273 
4274    Input Parameter:
4275 .   A - a MATSEQAIJCUSPARSE matrix
4276 
4277    Output Parameter:
4278 .   a - pointer to the device data
4279 
4280    Level: developer
4281 
4282    Notes: does not trigger host-device copies and flags data validity on the GPU
4283 
4284 .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
4285 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparse = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* Direct value access is only supported for the CSR storage format */
  PetscCheck(cusparse->format != MAT_CUSPARSE_ELL && cusparse->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* Write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device
     copy is triggered here since the current values will be overwritten */
  PetscCheck(cusparse->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparse->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* The device copy becomes authoritative; any cached explicit transpose
     no longer matches the (about to be written) values */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4304 
4305 /*@C
4306    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4307 
4308    Not Collective
4309 
4310    Input Parameter:
4311 .   A - a MATSEQAIJCUSPARSE matrix
4312 
4313    Output Parameter:
4314 .   a - pointer to the device data
4315 
4316    Level: developer
4317 
4318 .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
4319 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* Values were (re)written on the device: drop the cached diagonal and
     bump the object state so dependent objects recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4331 
/* Strict weak ordering on COO-style (row, col, value, flag) tuples: compares
   lexicographically by the first two components (row index, then column index);
   the carried scalar value and permutation flag do not participate. Used as the
   comparator when merging the entries of two matrices with thrust::merge(). */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &lhs, const thrust::tuple<int, int, PetscScalar, int> &rhs)
  {
    const int lr = lhs.get<0>(), rr = rhs.get<0>();
    return lr < rr || (lr == rr && lhs.get<1>() < rhs.get<1>());
  }
};
4342 
/* Unary functor adding a fixed offset to an integer index; used below to shift
   B's column indices (by A's column count) and row offsets when concatenating
   two CSR matrices. */
struct Shift
{
  int off; /* offset added to every input value */

  Shift(int shift) : off(shift) {}
  __host__ __device__
  inline int operator() (const int &v)
  {
    return off + v;
  }
};
4354 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation

   C has A's rows and A->cmap->n + B->cmap->n columns; A's columns come first.
   MAT_INITIAL_MATRIX builds C's CSR structure on the device (recording in
   Ccusp->cooPerm the permutation that maps [A entries; B entries] to C's
   merged ordering) and mirrors the structure on the host; MAT_REUSE_MATRIX
   only refreshes C's values through that stored permutation.
   MAT_INPLACE_MATRIX and the ELL/HYB storage formats are not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* Create C and hand-build its cusparse structures (no preallocation path) */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF,C));
    PetscCall(MatSetSizes(*C,m,n,m,n));
    PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    /* C stores full rows; compressed-row storage is disabled */
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    /* cusparse descriptor and the device-resident scalar constants used by SpMV */
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm records where each of [A's entries; B's entries] lands in C; the
       MAT_REUSE_MATRIX branch below depends on it to scatter fresh values */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets into explicit COO row indices so entries of A
         and B can be merged row-by-row */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* marker streams: 1 tags entries from A, 0 tags entries from B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's column indices on the fly so they land after A's columns */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      /* merge the (row, col, value, origin-flag) streams of A and B into C,
         ordered by (row, col); wPerm receives the origin flags in C order */
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift of B's column indices applied above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      /* from the origin flags, build cooPerm: positions of A's entries in C
         first (p1), followed by positions of B's entries (p2) */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress C's COO row indices back into CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked by rows: concatenate the transposes' CSR arrays,
           shifting B's transpose row offsets by A's nonzero count */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          /* step back one: A's closing offset coincides with B's (shifted) opening offset */
          thrust::advance(rT,-1);
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's CSR structure (i, j) on the host to complete the Mat_SeqAIJ part;
       values stay GPU-only (c->a is allocated but not filled here) */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m+1,&c->i));
    PetscCall(PetscMalloc1(c->nz,&c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m,&c->ilen));
    PetscCall(PetscMalloc1(m,&c->imax));
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* derive per-row lengths and row statistics from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz,&c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: C's structure and cooPerm already exist; only refresh values */
    PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* cooPerm holds A's destinations in [begin, pmid) and B's in [pmid, end) */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C through the stored permutation */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C through the stored permutation */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        /* refresh C' values by re-concatenating A' and B' values */
        PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4656 
/* Copies n values of the device-resident aij value array of A into v:
   when idx is provided, gathers av[idx[0..n-1]]; otherwise copies the first
   n values contiguously. v may point to host or device memory (detected with
   isCudaMem()); host destinations are staged through a device scratch buffer
   for the gather path.

   Fix: the final logging call used PetscLogCpuToGpu() although the data
   movement when v is host memory is device-to-host (cudaMemcpyDeviceToHost);
   it now logs PetscLogGpuToCpu(). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem; /* true when v is device memory */
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the indices and gather on the device via a permutation iterator */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); /* idx[] moved host -> device */

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* device scratch, copied back to host below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
    delete w;
  } else {
    /* no index set: plain contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar))); /* values moved device -> host */
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4694