xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d7cc930e14e615e9907267aaa472dd0ccceeab82)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library.
4   CUSPARSE changed completely with CUDA 11, so this functionality requires an older CUDA.
5 */
6 #define PETSC_SKIP_SPINLOCK
7 #define PETSC_SKIP_CXX_COMPLEX_FIX
8 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
9 
10 #include <petscconf.h>
11 #include <petscpkg_version.h>
12 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
13 #include <../src/mat/impls/sbaij/seq/sbaij.h>
14 #include <../src/vec/vec/impls/dvecimpl.h>
15 #include <petsc/private/vecimpl.h>
16 #undef VecType
17 
18 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
19 
20 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
21 
/* Option strings for the supported GPU storage formats, indexed by MatCUSPARSEStorageFormat;
   the trailing entries (enum type name, option prefix, 0 terminator) are the layout expected
   by PetscOptionsEnum (used in MatSetFromOptions_SeqAIJCUSPARSE below). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23 
24 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
25 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
26 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
27 
28 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
29 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
30 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
31 
32 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
34 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
36 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
37 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
38 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
39 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
40 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
41 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
42 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
43 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
44 
45 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
46 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
47 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
48 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
49 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
50 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
51 
/* Attach a CUDA stream to the cuSPARSE handle stored on the matrix so that all
   subsequent cuSPARSE operations issued through this matrix run on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
62 
/* Install an externally created cuSPARSE handle on the matrix. A previously held
   handle that differs from the new one is destroyed first; the pointer mode is
   (re)set to device pointers in every case, even when the handle is unchanged. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the handle we currently own before adopting the new one */
    if (cusp->handle) {
      stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
    }
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
78 
/* Drop (without destroying) the cuSPARSE handle stored on the matrix; used when
   the handle is owned elsewhere and must not be freed together with the matrix.

   Fix: guard against a NULL A->spptr. This routine can be reached on a matrix
   whose GPU data structure has already been destroyed (or was never created),
   in which case the old code dereferenced a NULL pointer. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
87 
/* Query callback composed on factor matrices (as "MatFactorGetSolverType_C"):
   reports that factorizations of this matrix use the "cusparse" solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
94 
95 /*MC
96   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
97   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
99   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
100   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
101   algorithms are not recommended. This class does NOT support direct solver operations.
102 
103   Level: beginner
104 
105 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
106 M*/
107 
/* Factory routine for MATSOLVERCUSPARSE factor matrices: creates an empty square
   SEQAIJCUSPARSE matrix of the same local size as A and installs the symbolic
   factorization routines matching the requested factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* the factor matrix's storage is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
133 
/* Type-specific implementation of MatCUSPARSESetFormat for SEQAIJCUSPARSE:
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single stored format. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:  /* sequential matrices keep one format for all operations */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
151 
152 /*@
153    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
154    operation. Only the MatMult operation can use different GPU storage formats
155    for MPIAIJCUSPARSE matrices.
156    Not Collective
157 
158    Input Parameters:
159 +  A - Matrix of type SEQAIJCUSPARSE
160 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
161 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
162 
163    Output Parameter:
164 
165    Level: intermediate
166 
167 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
168 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific implementation when one is composed; no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
178 
179 /*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
181 
182    Collective on mat
183 
184    Input Parameters:
185 +  A - Matrix of type SEQAIJCUSPARSE
186 -  transgen - the boolean flag
187 
188    Level: intermediate
189 
190 .seealso: MATSEQAIJCUSPARSE
191 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscBool      iscusparse;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* silently ignore matrices of any other type */
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    /* when turning the flag off, drop any cached transpose so a later re-enable
       cannot see a stale matrix */
    if (!transgen) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
211 
/* Process runtime options for SEQAIJCUSPARSE matrices: explicit-transpose
   generation and the GPU storage format(s) used for SpMV / triangular solves. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    /* only meaningful for non-factored matrices (the setter errors on factors) */
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);
    }
    /* storage format used for MatMult only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* NOTE(review): for factored matrices A->spptr holds a Mat_SeqAIJCUSPARSETriFactors,
     not a Mat_SeqAIJCUSPARSE, so reading cusparsestruct->format below looks
     type-unsafe on that path — confirm against the factor setup code. */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
242 
/* Symbolic ILU factorization: delegates entirely to the CPU SeqAIJ implementation,
   then redirects the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* set after the call above — presumably it installs the CPU numeric routine,
     which we override here; verify before reordering */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
252 
/* Symbolic LU factorization: delegates to the CPU SeqAIJ implementation,
   then redirects the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* override whatever numeric routine the symbolic call installed */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
262 
/* Symbolic ICC factorization: delegates to the CPU SeqAIJ implementation,
   then redirects the numeric phase to the CUSPARSE Cholesky version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* override whatever numeric routine the symbolic call installed */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
272 
/* Symbolic Cholesky factorization: delegates to the CPU SeqAIJ implementation,
   then redirects the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* override whatever numeric routine the symbolic call installed */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
282 
/* Build the unit-diagonal lower-triangular CSR factor on the GPU from the host
   ILU data in A->data, and run the cuSPARSE triangular-solve analysis on it.
   No-op when the device copy is already current or the matrix is empty. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;     /* host-staged CSR row offsets / column indices */
  PetscScalar                       *AALo;            /* host-staged CSR values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1 (row 0 has none) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* page-locked host buffers so the thrust assigns below transfer efficiently */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;           /* row 0 is just the unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust device arrays are filled from the staged host buffers */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
387 
/* Build the upper-triangular CSR factor (non-unit diagonal, stored as the
   reciprocal of the pivot) on the GPU from the host ILU data and run the
   cuSPARSE triangular-solve analysis. No-op when the device copy is current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;    /* host-staged CSR row offsets / column indices */
  PetscScalar                       *AAUp;           /* host-staged CSR values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag decreases with the row index for factored SeqAIJ storage, hence the difference */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (page-locked for fast transfer) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix; rows are walked backwards because the
         U part of the factor is laid out from adiag[n] upward */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the host factor stores the pivot inverted at v[nz]; invert again so the
           device matrix holds the actual diagonal value */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
488 
/* (Re)build both ILU triangular factors on the GPU, allocate the solve work
   vector, and upload the row/column permutations when they are non-trivial. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  const PetscInt               *r,*c;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n,nperm = 0;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* start from a clean slate, then rebuild both factors */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation, applied before the lower triangular solve; uploaded only
     when it is not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    nperm += n;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation, applied after the upper triangular solve */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    nperm += n;
  }

  /* account for whatever permutation data actually went to the device */
  if (nperm) {ierr = PetscLogCpuToGpu(nperm*sizeof(PetscInt));CHKERRQ(ierr);}

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
536 
/* Build both triangular factors for ICC on the GPU. Only the upper-triangular
   structure is staged on the host; the lower solve reuses the same CSR pattern
   with CUSPARSE_OPERATION_TRANSPOSE and separately scaled values (AALo).
   No-op when the device copy is current or the matrix is empty. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* host-staged CSR row offsets / column indices */
  PetscScalar                       *AAUp;          /* values for the upper (non-transposed) solve */
  PetscScalar                       *AALo;          /* values for the lower (transposed) solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is read through the SBAIJ layout here — presumably ICC
     factors are stored with the SBAIJ field layout; confirm against the factor code */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (page-locked host staging) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] holds the pivot; both factors store its reciprocal on the diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonal entries are negated for the upper solve and additionally
             scaled by the reciprocal pivot for the (transposed) lower solve */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      /* intentionally UPPER: the lower factor is the transpose of this structure
         and is solved with CUSPARSE_OPERATION_TRANSPOSE below */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as the upper factor, different values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
684 
/*
   Rebuilds the GPU-side ICC triangular factor data for A: resets any existing
   triangular-factor structures, rebuilds them from the CPU factor, allocates the
   solve work vector, and (for non-identity orderings) uploads the row/column
   permutation index arrays used by MatSolve_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* free any previously built factor structures before rebuilding them */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* the ICC factor stores one triangle (a->nz entries incl. diagonal); the
     combined L and L^T solves touch 2*(a->nz-n)+n entries, used for flop logging */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* non-natural ordering: upload the permutation (rperm) and its inverse
       (cperm) so the permuted solve path can reorder b and x on the GPU */
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
721 
/*
   Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself runs on
   the CPU (MatLUFactorNumeric_SeqAIJ); this wrapper then selects the GPU solve
   kernels and copies the triangular factors to the device.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor numerically on the host; the factors live on the CPU afterwards */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* with natural row and column orderings the permutation-free solve applies */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no cusparse MatMatSolve support in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
751 
/*
   Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: factorization runs on
   the CPU (MatCholeskyFactorNumeric_SeqAIJ); this wrapper then selects the GPU
   solve kernels and copies the triangular factor data to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor numerically on the host; the factor lives on the CPU afterwards */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* with a natural ordering the permutation-free solve applies */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no cusparse MatMatSolve support in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factor and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
780 
/*
   Builds GPU data for MatSolveTranspose: for each of the two triangular factors
   already on the GPU, computes its explicit transpose via csr2csc (with the fill
   mode flipped in the descriptor) and runs the cusparse solve analysis on it.
   The results are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose.

   Callers (the MatSolveTranspose variants) invoke this only when both transpose
   pointers are NULL; the structs created here are allocated unconditionally.
   Fix vs. previous version: loTriFactorT/upTriFactorT were initialized from the
   existing ...PtrTranspose fields and then immediately overwritten with `new`;
   those dead initializers have been removed since they misleadingly suggested
   reuse of an existing transpose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;   /* freshly allocated below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;   /* freshly allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the lower triangular factor, flipping the
     fill mode since transposition swaps lower and upper triangles */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve op is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC; the CSC
     of a CSR matrix is exactly the CSR of its transpose */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the new structure on the factor container */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the upper triangular factor, flipping the
     fill mode since transposition swaps lower and upper triangles */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve op is non-transpose */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* publish the new structure on the factor container */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
911 
/*
   Builds the explicit GPU transpose of A (cusparsestruct->matTranspose) for use
   by MatMultTranspose. For CSR storage the transpose is one csr2csc call; for
   ELL/HYB the path is hyb2csr -> csr2csc -> csr2hyb with temporaries freed at
   the end. No-op when transpose generation is disabled or already done.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  /* NOTE(review): this initializer is dead — matstructT is reassigned with `new`
     below; the guard above it guarantees matTranspose is NULL when we proceed */
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* nothing to do if transpose generation was not requested, or already built */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars (1, 0, 1) for SpMV calls,
     matching the device pointer mode set below */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR: transpose dimensions are swapped; nnz count is unchanged */
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* upload the full (uncompressed) row offsets of A for the conversion;
       NOTE(review): assigned without checking for an existing allocation —
       presumably NULL on this path since matTranspose was NULL; verify */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* ELL/HYB cannot be transposed directly: round-trip through CSR */
    CsrMatrix *temp = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;

    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    /* ELL uses the max partition (pure ELL); HYB lets cusparse choose the split */
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1039 
/*
   Solves A^T x = b on the GPU for a permuted factorization: x = P_c^T U^-T L^-T P_r b
   applied in transpose order (U^T first, then L^T). The transposed factors are
   built lazily on first use via MatSeqAIJCUSPARSEAnalyzeTransposeForSolve.
   Open question kept from the original author: why analyze the transposed matrix
   explicitly instead of using op(A) = CUSPARSE_OPERATION_TRANSPOSE in
   MatSolve_SeqAIJCUSPARSE?
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* lazily build the transposed factors on first transpose solve, then re-read
     the pointers populated by the analysis routine */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, placing the result in x
     (rpermIndices exists because this routine is only selected for
     non-identity orderings) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: x -> tempGPU (transpose solve visits U before L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays before logging, so timings bracket the GPU work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count: one multiply+add per stored factor entry, minus the diagonal divides counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1110 
/*
   Solves A^T x = b on the GPU when the factorization used the natural ordering:
   no permutation of b or x is needed, so the solve is just U^T then L^T.
   Transposed factors are built lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* lazily build the transposed factors on first transpose solve, then re-read
     the pointers populated by the analysis routine */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: b -> tempGPU (transpose solve visits U before L) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore the vector arrays, then log time and flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1162 
/*
   Solves A x = b on the GPU for a permuted factorization:
   x = P_c^T U^-1 L^-1 P_r b. The row permutation is applied to b on the way in
   and the column permutation to x on the way out, both as thrust gathers; the
   triangular solves use the cusparse analysis data built at factorization time.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into tempGPU (rpermIndices exists
     because this routine is only selected for non-identity orderings) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder tempGPU with the column permutation into the final x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  /* restore the vector arrays, then log time and flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1221 
/*
   Solves A x = b on the GPU when the factorization used the natural ordering:
   no permutations are needed, so the solve is just L then U through the shared
   work vector.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore the vector arrays, then log time and flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1265 
/* Mirror the host AIJ matrix on the GPU. If the matrix was assembled before with the same
   nonzero pattern (same nonzerostate) and the storage format is CSR, only the numerical
   values are re-uploaded, and the cached explicit transpose (if one was built) is refreshed
   via csr2csc. Otherwise the whole GPU representation (CSR, or a CSR staged then converted
   to ELL/HYB) is rebuilt from the host arrays. No-op when the matrix is bound to the CPU or
   the GPU copy is already up to date (offloadmask). */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      mat->values->assign(a->a, a->a+a->nz); /* host -> device upload of the aij values array */
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* numeric-only conversion: transpose the values into the already-built matT pattern */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Pattern changed (or first assembly, or non-CSR format): discard and rebuild everything */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      /* NOTE(review): rowoffsets_gpu is deleted but not reset to NULL here; presumably it is
         recreated when the transpose is (re)generated -- confirm against MatSeqAIJCUSPARSEGenerateTransposeForMult */
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are represented on the GPU */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalars 1 and 0, required because the handle is put in device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* stage a temporary CSR matrix on the GPU, convert it to HYB/ELL, then free the staging copy */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL is HYB with the maximum partition width; HYB lets cusparse choose */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m); /* SpMV scratch sized to the compressed row count */
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m; /* number of row indices uploaded, for the transfer log below */
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate; /* remember the pattern we just mirrored */
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1424 
/* Functor for thrust::for_each over a zip iterator: accumulates the first tuple element
   into the second, i.e. dst[i] += src[i]. Used below to scatter-add the compressed-row
   SpMV result from the work vector into the full output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1434 
/* Functor for thrust::for_each over a zip iterator: assigns the second tuple element to
   the first ("reverse" of the usual destination-last convention). Used below to gather
   selected entries of the input vector into the compressed work vector before a
   transposed SpMV. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1444 
/* Per-product data attached to C->product for C = op(A)*op(B) computed with cusparse SpMM */
typedef struct {
  PetscBool   cisdense; /* user's C was MATSEQDENSE (CPU): convert back after computing on the GPU */
  PetscScalar *Bt;      /* device buffer for an explicit transpose of B (ABt/RARt only; cusparse spmm cannot transpose B) */
  Mat         X;        /* dense intermediate: A*B for PtAP, A*B^T for RARt; NULL otherwise */
} MatMatCusparse;
1450 
/* Destructor callback installed as C->product->destroy: releases the device transpose
   buffer, the dense intermediate matrix, and the MatMatCusparse struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;
  cudaError_t    cuer;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  cuer = cudaFree(mm->Bt);CHKERRCUDA(cuer); /* cudaFree(NULL) is a no-op */
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1463 
1464 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1465 
/* Numeric phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense: performs a
   cusparse SpMM on the GPU. For ABt/RARt, B is first transposed explicitly with cublas geam
   (cusparse spmm cannot transpose the dense operand). For PtAP/RARt, the SpMM result goes
   into the intermediate mmdata->X and a dense-dense product completes C. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,k,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  cusparseMatDescr_t           matA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cuer;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the cusparse operand (A or its cached transpose) and the SpMM dimensions:
     result is m x n, sparse operand is m x k, dense operand is k x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = A->cmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use an explicitly stored transpose of A instead of the transpose operation */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    k = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    k   = B->cmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matA   = mat->descr;
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* the SpMM output is the intermediate X, not C itself */
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* out-of-place transpose of B into mmdata->Bt: geam with alpha=1, beta=0 */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
  }

  /* perform the MatMat operation */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha,matA,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
  cuer = WaitForCUDA();CHKERRCUDA(cuer);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    /* complete RARt: C = B * X with X = A*B^T (last flag presumably a conjugation switch -- confirm) */
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    /* complete PtAP: C = B^T * X with X = A*B */
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) {
    /* user's C lived on the CPU: convert the result back */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1593 
/* Symbolic phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense: sets the
   sizes and type of C, allocates the per-product MatMatCusparse data (device transpose
   buffer for ABt/RARt, dense intermediate X for PtAP/RARt), and installs the numeric
   function. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the result C for each supported product */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
  /* cusparse spmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr;

    cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
1667 
1668 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
1669 
/* handles dense B: selects the GPU symbolic routine for the supported product types,
   or defers to the CPU SeqAIJ*SeqDense implementation when A is bound to the CPU.
   Unsupported product types are left untouched (no symbolic function installed). */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
  /* a CPU-bound A cannot use the GPU kernels: dispatch to the CPU implementation */
  if (product->A->boundtocpu) {
    ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
    C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
    break; /* previously fell through into default; make the intent explicit */
  default:
    break;
  }
  PetscFunctionReturn(0);
}
1695 
/* MatMult: yy = A*xx, via the shared SpMV kernel (no add vector, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1704 
/* MatMultAdd: zz = A*xx + yy, via the shared SpMV kernel (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1713 
/* MatMultHermitianTranspose: yy = A^H * xx (conjugate transpose), via the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1722 
/* MatMultHermitianTransposeAdd: zz = A^H * xx + yy, via the shared SpMV kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1731 
/* MatMultTranspose: yy = A^T * xx (no conjugation), via the shared SpMV kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1740 
/* Shared kernel behind all MatMult variants:
     zz = op(A)*xx + yy   when yy is non-NULL (yy may alias zz),
     zz = op(A)*xx        when yy is NULL,
   where op(A) = A (trans=FALSE), A^T (trans=TRUE, herm=FALSE) or A^H (trans=herm=TRUE).
   When the matrix uses compressed-row storage (zero rows dropped), the SpMV output
   (non-transpose) or input (transpose) goes through a work vector of length nrows and is
   scattered/gathered with the cprowIndices permutation. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows, hence op(A)*xx = 0: the result is yy (MatMultAdd) or 0 (MatMult).
       Previously the yy != NULL case returned without writing zz, leaving it stale. */
    if (!yy) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy; when zz == yy there is nothing to do */
    }
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* no explicit transpose available (or Hermitian requested): let cusparse apply the operation */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use (and lazily build) the explicitly stored transpose, multiplied non-transposed */
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) {
        ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
        matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      }
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv output if compressed */
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
    } else {
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; /* Use a work vector for SpMv input if compressed */
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;

      if (compressed) {
        /* gather the entries of xx at the nonzero-row indices into the work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
    }

    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1864 
/* MatMultTransposeAdd: zz = A^T * xx + yy, via the shared SpMV kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1873 
/* Finish assembly on the CPU, then mirror the assembled matrix on the GPU. The GPU copy is
   skipped for flush (partial) assemblies, for matrices bound to the CPU, and for factored
   matrices. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1886 
1887 /* --------------------------------------------------------------------------------*/
1888 /*@
1889    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1890    (the default parallel PETSc format). This matrix will ultimately be pushed down
1891    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1892    assembly performance the user should preallocate the matrix storage by setting
1893    the parameter nz (or the array nnz).  By setting these parameters accurately,
1894    performance during matrix assembly can be increased by more than a factor of 50.
1895 
1896    Collective
1897 
1898    Input Parameters:
1899 +  comm - MPI communicator, set to PETSC_COMM_SELF
1900 .  m - number of rows
1901 .  n - number of columns
1902 .  nz - number of nonzeros per row (same for all rows)
1903 -  nnz - array containing the number of nonzeros in the various rows
1904          (possibly different for each row) or NULL
1905 
1906    Output Parameter:
1907 .  A - the matrix
1908 
1909    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1910    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1911    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1912 
1913    Notes:
1914    If nnz is given then nz is ignored
1915 
1916    The AIJ format (also called the Yale sparse matrix format or
1917    compressed row storage), is fully compatible with standard Fortran 77
1918    storage.  That is, the stored row and column indices can begin at
1919    either one (as in Fortran) or zero.  See the users' manual for details.
1920 
1921    Specify the preallocated storage with either nz or nnz (not both).
1922    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1923    allocation.  For large problems you MUST preallocate memory or you
1924    will get TERRIBLE performance, see the users' manual chapter on matrices.
1925 
1926    By default, this format uses inodes (identical nodes) when possible, to
1927    improve numerical efficiency of matrix-vector products and solves. We
1928    search for consecutive rows with the same nonzero structure, thereby
1929    reusing matrix information to achieve increased efficiency.
1930 
1931    Level: intermediate
1932 
1933 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1934 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation routine directly; the cast drops const to match its signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1946 
/* Destroy the GPU-side data hanging off A->spptr (the SpMV structures for an
   unfactored matrix, the triangular-factor structures for a factored one),
   detach the composed methods, then fall through to the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* remove composed function pointers so the object cannot dispatch into freed state */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1964 
1965 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
1966 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the host SeqAIJ path, then run the in-place conversion so the
   new matrix gets its own CUSPARSE-specific structures and function table. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1976 
/* Switch A's operation table between the host (SeqAIJ) and device (CUSPARSE)
   implementations; flg == PETSC_TRUE binds the matrix to the CPU.
   Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* host ops; no host implementation is installed for the Hermitian variants */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* device ops, plus the MatProduct hooks for dense right-hand sides */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
2012 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.
   MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies values
   into an existing *newmat, and MAT_INPLACE_MATRIX converts A itself (the
   caller passes *newmat == A).  The converted matrix gets a cuSPARSE handle,
   GPU-capable operations, and VECCUDA as its default vector type. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created from this matrix should live on the GPU by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* unfactored matrix: attach the SpMV structure (CSR storage by default) */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      /* factored matrix: attach the triangular-factor structure instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    /* nothing has been copied to the device yet */
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU mult/product operations (i.e. unbind from the CPU) */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2058 
/* Type constructor registered for MATSEQAIJCUSPARSE: ensure CUDA is initialized,
   build a plain SeqAIJ matrix, then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2069 
2070 /*MC
2071    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2072 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
2074    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
2075    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2076 
2077    Options Database Keys:
2078 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2079 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2080 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2081 
2082   Level: beginner
2083 
2084 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2085 M*/
2086 
2087 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
2088 
2089 
/* Register the cusparse solver package: the single factory routine
   MatGetFactor_seqaijcusparse_cusparse serves LU, Cholesky, ILU, and ICC
   factorizations of MATSEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factortypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i = 0; i < sizeof(factortypes)/sizeof(factortypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factortypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2101 
/* Tear down the Mat_SeqAIJCUSPARSE struct: both mult structures (plain and
   transpose), the device work vectors, the cuSPARSE handle, and finally the
   struct itself.  Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    cusparseHandle_t handle = (*cusparsestruct)->handle;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if (handle) { stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); }
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2121 
/* Free the three device arrays of a CsrMatrix and the struct itself;
   the caller's pointer is zeroed.  A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;

    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
2134 
/* NOTE(review): despite the "MultStruct" name, this overload destroys a
   Mat_SeqAIJCUSPARSETriFactorStruct (matrix descriptor, solve-analysis info,
   and the factor's CSR storage); the caller's pointer is zeroed.
   It is overloaded with the Mat_SeqAIJCUSPARSEMultStruct version below. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
2150 
/* Destroy a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (HYB/ELL hybrid
   object or CSR arrays, selected by 'format'), the matrix descriptor, the
   compressed-row index vector, and the device-resident scalar constants.
   The caller's pointer is zeroed; a NULL *matstruct is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* was previously called with the return code silently discarded;
           check it like every other CsrMatrix_Destroy() call site */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta scalars live in device memory (used by cusparse SpMV calls) */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
2178 
/* Release the contents of a Mat_SeqAIJCUSPARSETriFactors (all four triangular
   factors, the permutation index vectors, and the work vector) without freeing
   the struct itself or its cuSPARSE handle, so it can be refilled. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *tf = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete tf->rpermIndices;
    tf->rpermIndices = NULL;
    delete tf->cpermIndices;
    tf->cpermIndices = NULL;
    delete tf->workVector;
    tf->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
2198 
/* Fully destroy a Mat_SeqAIJCUSPARSETriFactors: reset its contents, release
   the cuSPARSE handle, and free the struct.  Safe for *trifactors == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle = (*trifactors)->handle;

    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    if (handle) { stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2215 #else
2216 
2217 /* The following stubs are only provided to satisfy the linker */
2218 
/* Linker-satisfying stub: always errors, since this CUSPARSE implementation
   only supports CUDA < 11 (see the PETSC_PKG_CUDA_VERSION_LT guard above). */
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) {
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!");
}
2222 
/* Linker-satisfying stub: always errors under CUDA >= 11. */
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!");
}
2227 
/* Linker-satisfying stub: always errors under CUDA >= 11. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!");
}
2232 
/* Linker-satisfying stub: always errors under CUDA >= 11. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE in CUDA 11 is currently not supported!");
}
2237 #endif
2238