xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ea13f565a9e7eeb46a1941ff623f2af24053d663)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 
37 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
38 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
39 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
40 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
41 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
42 
/* Record the CUDA stream for this matrix and attach it to the matrix's
   cuSPARSE handle so subsequent cuSPARSE calls are issued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   cstat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
53 
/* Adopt a caller-supplied cuSPARSE handle for this matrix, destroying any
   handle the matrix previously owned, and set the pointer mode so that
   scalar arguments (alpha/beta) are read from device memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   cstat;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the handle we currently hold before adopting the new one */
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
69 
/* Drop this matrix's reference to its cuSPARSE handle WITHOUT destroying it;
   used when the handle is owned elsewhere (see MatCUSPARSESetHandle()). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscFunctionBegin;
  /* unconditional: clearing an already-null handle is a no-op */
  cusp->handle = 0;
  PetscFunctionReturn(0);
}
78 
/* Report the solver package ("cusparse") that produced a factored matrix.
   Composed on factor matrices under the name "MatFactorGetSolverType_C"
   (see MatGetFactor_seqaijcusparse_cusparse() below). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
85 
86 /*MC
87   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
90   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
91   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
92   algorithms are not recommended. This class does NOT support direct solver operations.
93 
94   Level: beginner
95 
96 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
97 M*/
98 
/* Create an (unfactored) matrix B that will hold a factorization of A computed
   by the CUSPARSE solver package.  Installs the symbolic-factorization methods
   appropriate to the requested factor type; the symbolic routines in turn
   install the numeric routines.  B is created square of size A->rmap->n. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* the factorization routines allocate the actual storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
123 
/* Type-specific implementation of MatCUSPARSESetFormat().  A sequential
   matrix has a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL set the same field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
141 
142 /*@
143    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144    operation. Only the MatMult operation can use different GPU storage formats
145    for MPIAIJCUSPARSE matrices.
146    Not Collective
147 
148    Input Parameters:
149 +  A - Matrix of type SEQAIJCUSPARSE
150 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
152 
153    Output Parameter:
154 
155    Level: intermediate
156 
157 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for matrix types that do not compose
     "MatCUSPARSESetFormat_C", so this is safe to call on any Mat */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
/* Process run-time options controlling the GPU storage format:
   -mat_cusparse_mult_storage_format affects SpMV only (non-factored matrices),
   -mat_cusparse_storage_format affects SpMV and triangular solves. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* NOTE(review): for factored matrices A->spptr holds a
     Mat_SeqAIJCUSPARSETriFactors (see the factor routines below), yet
     cusparsestruct->format is read here unconditionally — presumably relies
     on the two structs' layouts being compatible; TODO confirm */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
194 
/* Symbolic ILU: reuse the CPU implementation, then route the numeric phase to
   the CUSPARSE-aware routine, which also copies the factors to the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
/* Symbolic LU: reuse the CPU implementation, then route the numeric phase to
   the CUSPARSE-aware routine, which also copies the factors to the GPU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
/* Symbolic ICC: reuse the CPU implementation, then route the numeric phase to
   the CUSPARSE-aware Cholesky routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
/* Symbolic Cholesky: reuse the CPU implementation, then route the numeric
   phase to the CUSPARSE-aware routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/* Build the unit-diagonal lower-triangular factor L of the ILU/LU
   factorization on the GPU.  The strictly-lower part of each row of the host
   factor (Mat_SeqAIJ layout: rows i of L occupy a->i[i]..a->i[i+1]-1) is
   copied into pinned host buffers with a 1.0 diagonal entry appended, the
   resulting CSR matrix is uploaded into a new
   Mat_SeqAIJCUSPARSETriFactorStruct, and the cuSPARSE triangular-solve
   analysis is performed on it.  Runs only when the device copy is missing or
   stale relative to the host (offloadmask). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1
         (row 0 contributes only its diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the thrust assign()s below transfer quickly */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      /* row 0 holds just the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* ... then append the unit diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* the assign()s below copy the pinned host buffers to the device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the pinned staging buffers are no longer needed */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/* Build the upper-triangular factor U of the ILU/LU factorization on the GPU.
   In the factored Mat_SeqAIJ layout the U part of row i is addressed through
   a->diag: entries adiag[i+1]+1 .. adiag[i]-1 with the (stored) diagonal at
   adiag[i], traversed here from the last row backwards.  The assembled CSR is
   uploaded into a new Mat_SeqAIJCUSPARSETriFactorStruct and analyzed for
   triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases with the row index in this layout, hence the difference */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory for fast transfer in the thrust assign()s below */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* walk rows from last to first, filling the CSR arrays back-to-front */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* v[nz] is the value stored at adiag[i]; the diagonal of U is its
           reciprocal (the numeric factorization stores the inverted pivot —
           NOTE(review): confirm against MatLUFactorNumeric_SeqAIJ) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* the assign()s copy the pinned host buffers to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* staging buffers are no longer needed */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
440 
/* Build both ILU triangular factors on the GPU, allocate the solve work
   vector, and upload the row/column permutations when the orderings are not
   the identity (identity permutations are represented by leaving the index
   arrays unallocated). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow  = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n      = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  fs->workVector = new THRUSTARRAY(n);
  fs->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: only copied to the GPU when it is not the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    fs->rpermIndices = new THRUSTINTARRAY(n);
    fs->rpermIndices->assign(r, r+n);
    nperm++;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation, same convention */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    fs->cpermIndices = new THRUSTINTARRAY(n);
    fs->cpermIndices->assign(c, c+n);
    nperm++;
  }

  /* log n ints per permutation actually transferred */
  if (nperm) {
    ierr = PetscLogCpuToGpu(nperm*n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
487 
/* Build both triangular factors of an ICC/Cholesky factorization on the GPU.
   A single upper-triangular CSR structure is assembled on the host; it is
   uploaded twice with different value arrays: once as the (unit-diagonal)
   upper factor solved non-transposed, and once — with rescaled values — as the
   lower factor realized by solving the TRANSPOSE of the stored upper matrix
   (hence FILL_MODE_UPPER with OPERATION_TRANSPOSE below). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is a Mat_SeqAIJ (see `a` above) but is also viewed
     as Mat_SeqSBAIJ here — presumably the i/j/a fields occupy compatible
     slots in both structs; TODO confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; AAUp/AALo hold the two value arrays
         sharing the same sparsity pattern (AiUp/AjUp) */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored (last) entry of row i; both factors take its
           reciprocal on the diagonal — NOTE(review): relies on the value
           conventions of MatCholeskyFactorNumeric_SeqAIJ; confirm */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals are negated for U; L additionally divides by the
             stored diagonal value */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* stored UPPER but solved TRANSPOSED: this realizes the lower solve */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity pattern as the upper factor, different values (AALo) */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned staging buffers */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
635 
/* Complete ICC setup: build the triangular factors on the GPU, allocate the
   solve work vector, and upload the permutation and its inverse when the
   ordering is not the natural one. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both triangular factors share the n diagonal entries, hence this count */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  /* identity permutations are represented by leaving the index arrays
     unallocated; otherwise the permutation (rows) and its inverse (cols)
     are copied to the device */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
671 
/* Numeric LU factorization: the factorization itself runs on the CPU via the
   plain AIJ kernel; afterwards the GPU MatSolve variants are selected based
   on the ordering, and the triangular factors are copied to the device. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    /* natural ordering: the permutation steps in MatSolve can be skipped */
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve implementations are provided */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
700 
/* Numeric Cholesky factorization: performed on the CPU via the plain AIJ
   kernel; then the GPU MatSolve variants are chosen according to the
   ordering and the factors are copied to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    /* natural ordering: skip the permutation steps in MatSolve */
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve implementations are provided */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
729 
/*
   Builds explicit transposes (CSC form, stored as CSR) of the lower and upper
   triangular factors and runs the cusparse solve analysis on them.  The
   MatSolveTranspose_SeqAIJCUSPARSE* routines then solve with these transposed
   factors using CUSPARSE_OPERATION_NON_TRANSPOSE.  Called lazily the first
   time a transpose solve is requested.

   Fix: the previous version initialized loTriFactorT/upTriFactorT from
   cusparseTriFactors->*PtrTranspose only to overwrite both with `new` a few
   lines later; those dead stores suggested the existing transpose was being
   reused when it is not.  The locals are now plainly uninitialized until
   allocated.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;          /* allocated below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;          /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor: the transpose
     of a triangular matrix has the opposite fill mode, while matrix type,
     index base, and diagonal type carry over unchanged */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so solves use NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* stash the new factor in the Mat so it can be reused and later destroyed */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor (fill mode
     flipped, everything else carried over, as above) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so solves use NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);

  /* stash the new factor in the Mat so it can be reused and later destroyed */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
860 
/* Builds cusparsestruct->matTranspose, an explicit transpose of the GPU
   multiply structure, so MatMultTranspose can run as a plain (non-transposed)
   SpMV.  Handles both the CSR format (one csr2csc call) and the ELL/HYB
   formats (HYB -> CSR -> CSC -> HYB round trip). */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  /* NOTE(review): this initializer is dead -- matstructT is unconditionally
     overwritten by the `new` below.  If a transpose already existed it would
     be leaked here; callers appear to build it only once -- confirm. */
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the triangular factor information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants (1 and 0) consumed
     by the SpMV calls, matching CUSPARSE_POINTER_MODE_DEVICE below */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose dimensions: rows and columns swap; the nonzero count is unchanged */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* upload the full (uncompressed) row offsets; kept on the struct so the
       value-only update path in MatSeqAIJCUSPARSECopyToGPU can reuse them */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR (temporary device buffers) */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB; ELL is HYB with a MAX partition */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries (member-by-member, since CsrMatrix has no destructor here) */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
987 
/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b with non-natural ordering: gather b through the row
   permutation, solve with the transposed upper then lower factors, and
   scatter the result through the column permutation. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, the first
     time a transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]].
     NOTE(review): the end iterator is based on bGPU+n while the begin uses
     bGPU; thrust::copy only measures the distance between the index
     iterators, so this appears harmless, but it differs from the
     (bGPU, rperm->end()) form used in MatSolve_SeqAIJCUSPARSE -- confirm. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: xarray -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor: tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1058 
/* Solve A^T x = b for the natural-ordering case (no permutations).  The
   transposed factors are built lazily on first use; the solve runs the
   transposed upper factor first, then the transposed lower factor, both as
   NON_TRANSPOSE cusparse solves on explicitly transposed storage. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* build and analyze the transposed factors on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* solve with the transposed upper factor: barray -> work */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        barray, work->data().get());CHKERRCUSPARSE(stat);

  /* solve with the transposed lower factor: work -> xarray */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore the vector arrays and account for the work done */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1110 
/* Solve A x = b with non-natural ordering: gather b through the row
   permutation into the work vector, run the lower then upper triangular
   solves, and scatter the result through the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* Then, solve U: xarray -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1169 
/* Solve A x = b for the natural-ordering case: no permutations, just the
   lower-factor solve into the work vector followed by the upper-factor
   solve into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* lower-factor solve: barray -> work */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        barray, work->data().get());CHKERRCUSPARSE(stat);

  /* upper-factor solve: work -> xarray */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray);CHKERRCUSPARSE(stat);

  /* restore the vector arrays and account for the work done */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1213 
/*
  Copies the host (CPU) AIJ representation of A to the GPU in the storage format
  selected in Mat_SeqAIJCUSPARSE (CSR, ELL, or HYB), (re)building the GPU data
  structures when the nonzero pattern has changed.

  Fast path: if the matrix was assembled before, the nonzero state is unchanged,
  and the format is CSR, only the numerical values are copied (and the cached
  transpose, if one exists, is refreshed via csr2csc).

  No-op when the matrix is bound to the CPU or the GPU copy is already current.
  On success the offload mask is set to PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Same nonzero pattern as the existing GPU copy: copy values only */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before; csr2csc re-derives the transpose's values on the GPU */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUSPARSE(stat);
      }
    } else {
      /* Pattern changed (or first copy): tear down any existing GPU structures and rebuild */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* With compressed rows, only the nonzero rows are stored/multiplied; ridx maps them back */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL; /* In this case, one should not dereference ridx */
        }
        cusparsestruct->nrows = m;

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* Device-resident scalar constants (1, 0, 1) so spmv can run in device pointer mode */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix: upload row offsets, column indices, and values from the host AIJ arrays */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: first stage the data as CSR on the GPU, then convert with csr2hyb */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* The CSR staging arrays are no longer needed once converted to HYB */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices; workVector holds the per-nonzero-row spmv result */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    err  = WaitForGPU();CHKERRCUDA(err);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1359 
/* Thrust functor used with zip iterators: accumulates the first tuple
   element into the second, i.e. get<1>(t) += get<0>(t). Used below to
   scatter-add the compressed-row work vector into the output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
1369 
/* y = A*x on the GPU; implemented as a multiply-add with a NULL addend,
   so all storage-format and compressed-row logic lives in one place. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1378 
/*
  y = A^T * x on the GPU. Rather than asking cusparse for a transposed spmv,
  this multiplies with an explicitly stored transpose (matTranspose), which is
  generated lazily on first use.

  Fix: the HYB/ELL branch now skips the spmv when cusparsestruct->nrows == 0,
  matching the empty-matrix guard already present in MatMultAdd_SeqAIJCUSPARSE
  and MatMultTransposeAdd_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* Lazily build the explicit transpose the first time it is needed */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* Non-transposed spmv with the stored transpose gives y = A^T x */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUSPARSE(stat);
  } else {
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    if (cusparsestruct->nrows) { /* skip the spmv for an empty matrix, as the sibling mult routines do */
      stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               matstructT->alpha, matstructT->descr, hybMat,
                               xarray, matstructT->beta_zero,
                               yarray);CHKERRCUSPARSE(stat);
    }
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1429 
1430 
/*
  z = A*x + y on the GPU (or z = A*x when yy is NULL; MatMult_SeqAIJCUSPARSE
  calls this with yy = NULL). Handles both plain and compressed-row storage:
  with compressed rows the spmv writes a short work vector that is then
  scatter-added into z via the cprowIndices permutation.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  PetscBool                    compressed = a->compressedrow.use; /* Does the matrix use compressed rows (i.e., drop zero rows)? */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);

    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; /* Use a work vector for SpMv result if compressed */
    /* beta_one only when accumulating in place into zz (z += A*x); otherwise overwrite the target with A*x */
    beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv does y = alpha*Ax + beta*y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUSPARSE(stat);
    } else {
      if (cusparsestruct->nrows) { /* skip the spmv for an empty matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) { /* MatMultAdd: zz = A*xx + yy */
      if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
      } else if (zz != yy) { /* A's not compreseed. zz already contains A*xx, and we just need to add yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }

    /* ScatterAdd the result from work vector into the full vector when A is compressed */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (compressed) {
      /* zz[cprowIndices[k]] += workVector[k] for each stored (nonzero) row k */
      thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1509 
/*
  z = A^T*x + y on the GPU, multiplying with the explicitly stored transpose
  (generated lazily on first use). Note: unlike the forward mult, the stored
  transpose uses non-compressed row storage, so no work-vector scatter is needed.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*beta;
  PetscErrorCode                  ierr;
  cudaError_t                     cerr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* Lazily build the explicit transpose the first time it is needed */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  /* Note unlike Mat, MatTranspose uses non-compressed row storage */
  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* if yy aliases zz, accumulate in place (beta = 1); otherwise overwrite and add yy afterwards */
    beta = (yy == zz) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               zarray);CHKERRCUSPARSE(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->nrows) { /* skip the spmv for an empty matrix */
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 zarray);CHKERRCUSPARSE(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (zz != yy) {ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);}
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  cerr = WaitForGPU();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1568 
/* Finish assembly: run the host AIJ assembly, push the data to the GPU (for
   non-factor matrices on a final assembly), and (re)install the GPU mult ops. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* Nothing GPU-side to do for a flush, or when the matrix is pinned to the CPU */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1585 
1586 /* --------------------------------------------------------------------------------*/
1587 /*@
1588    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1590    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1591    assembly performance the user should preallocate the matrix storage by setting
1592    the parameter nz (or the array nnz).  By setting these parameters accurately,
1593    performance during matrix assembly can be increased by more than a factor of 50.
1594 
1595    Collective
1596 
1597    Input Parameters:
1598 +  comm - MPI communicator, set to PETSC_COMM_SELF
1599 .  m - number of rows
1600 .  n - number of columns
1601 .  nz - number of nonzeros per row (same for all rows)
1602 -  nnz - array containing the number of nonzeros in the various rows
1603          (possibly different for each row) or NULL
1604 
1605    Output Parameter:
1606 .  A - the matrix
1607 
1608    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1610    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1611 
1612    Notes:
1613    If nnz is given then nz is ignored
1614 
1615    The AIJ format (also called the Yale sparse matrix format or
1616    compressed row storage), is fully compatible with standard Fortran 77
1617    storage.  That is, the stored row and column indices can begin at
1618    either one (as in Fortran) or zero.  See the users' manual for details.
1619 
1620    Specify the preallocated storage with either nz or nnz (not both).
1621    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1622    allocation.  For large problems you MUST preallocate memory or you
1623    will get TERRIBLE performance, see the users' manual chapter on matrices.
1624 
1625    By default, this format uses inodes (identical nodes) when possible, to
1626    improve numerical efficiency of matrix-vector products and solves. We
1627    search for consecutive rows with the same nonzero structure, thereby
1628    reusing matrix information to achieve increased efficiency.
1629 
1630    Level: intermediate
1631 
1632 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1633 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create an m-by-n sequential matrix (local sizes double as global sizes) ... */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  /* ... give it the CUSPARSE type ... */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* ... and preallocate: nnz (per-row counts) takes precedence over nz when non-NULL */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1645 
/* Release the GPU-side data (mult structures or triangular factors) attached
   to A->spptr, then fall through to the host AIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* Factored matrix: spptr holds the triangular-factor structures */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
    /* Regular matrix with GPU data allocated */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1661 
1662 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the host AIJ duplicate, then attach a fresh (empty) set of
   CUSPARSE structures to the copy and install the CUSPARSE ops. The GPU copy
   of the new matrix starts unallocated and is built on first use. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;
  Mat C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    C->spptr = new Mat_SeqAIJCUSPARSE;
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)C->spptr;
    spptr->mat            = 0;
    spptr->matTranspose   = 0;
    spptr->workVector     = 0;
    spptr->rowoffsets_gpu = 0;
    spptr->format         = MAT_CUSPARSE_CSR;
    spptr->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    spptr->handle         = handle;
    spptr->nonzerostate   = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)C->spptr;
    fs->loTriFactorPtr          = 0;
    fs->upTriFactorPtr          = 0;
    fs->loTriFactorPtrTranspose = 0;
    fs->upTriFactorPtrTranspose = 0;
    fs->rpermIndices            = 0;
    fs->cpermIndices            = 0;
    fs->workVector              = 0;
    fs->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    fs->handle                  = handle;
    fs->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  C->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->boundtocpu  = PETSC_FALSE;
  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1724 
1725 
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscFunctionBegin;
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() do nothing in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  if (!flg) {
    /* Unbinding: route the operations back through the CUSPARSE implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
    A->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
    A->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  } else {
    /* Binding to CPU: fall back to the plain sequential AIJ operations */
    A->ops->mult             = MatMult_SeqAIJ;
    A->ops->multadd          = MatMultAdd_SeqAIJ;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ;
    A->ops->assemblyend      = MatAssemblyEnd_SeqAIJ;
    A->ops->duplicate        = MatDuplicate_SeqAIJ;
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1754 
/* In-place conversion of a SeqAIJ matrix to SeqAIJCUSPARSE: attach the empty
   GPU-side structures (mult structures, or triangular-factor pointers for a
   factor matrix), switch the default vector type to CUDA, and install the
   CUSPARSE operation table. The GPU copy is built lazily at assembly/use. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)B->spptr;
    spptr->mat            = 0;
    spptr->matTranspose   = 0;
    spptr->workVector     = 0;
    spptr->rowoffsets_gpu = 0;
    spptr->format         = MAT_CUSPARSE_CSR;
    spptr->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    spptr->handle         = handle;
    spptr->nonzerostate   = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
    fs->loTriFactorPtr          = 0;
    fs->upTriFactorPtr          = 0;
    fs->loTriFactorPtrTranspose = 0;
    fs->upTriFactorPtrTranspose = 0;
    fs->rpermIndices            = 0;
    fs->cpermIndices            = 0;
    fs->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSPARSE(stat);
    fs->handle                  = handle;
    fs->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu        = MatBindToCPU_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->boundtocpu  = PETSC_FALSE;
  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1811 
/* Type constructor: build a plain SeqAIJ matrix, then convert it in place
   to SeqAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1821 
1822 /*MC
1823    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1824 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1826    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1827    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1828 
1829    Options Database Keys:
1830 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1831 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1832 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1833 
1834   Level: beginner
1835 
1836 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1837 M*/
1838 
1839 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1840 
1841 
/*
   MatSolverTypeRegister_CUSPARSE - Registers the CUSPARSE solver package as a
   factorization provider for MATSEQAIJCUSPARSE matrices.

   The same factory routine, MatGetFactor_seqaijcusparse_cusparse(), serves all
   four supported factorization kinds: LU, Cholesky, ILU and ICC.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(factors)/sizeof(factors[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1853 
1854 
/*
   MatSeqAIJCUSPARSE_Destroy - Frees all GPU-side resources held by a
   Mat_SeqAIJCUSPARSE struct and zeros the caller's pointer.

   A NULL *cusparsestruct is a no-op, so the routine is safe to call on an
   already-destroyed (or never-allocated) struct.

   Fixes over the previous version:
   - the PetscErrorCode returned by the two MatSeqAIJCUSPARSEMultStruct_Destroy()
     calls was silently discarded; it is now checked with CHKERRQ, matching the
     error-checking convention used everywhere else in this file
   - the intentional assignment inside the if-condition is double-parenthesized
     to silence -Wparentheses and make the intent explicit
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* release both the plain and the transpose GPU matrix representations */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((handle = (*cusparsestruct)->handle)) { /* assignment intended */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1874 
/*
   CsrMatrix_Destroy - Deletes the three device vectors of a CsrMatrix
   (row offsets, column indices, values), then the struct itself, and
   zeros the caller's pointer. A NULL *mat is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;
    /* the three members are independently owned, so deletion order is free */
    delete csr->row_offsets;
    delete csr->column_indices;
    delete csr->values;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1887 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - Releases
   the cusparse matrix descriptor, the solve-analysis info, and the CSR data of
   one triangular factor, then deletes the struct and zeros the caller's
   pointer. A NULL *trifactor is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1903 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - Releases the
   GPU matrix representation (HYB/ELL handle or CSR data, depending on the
   storage format), the cusparse descriptor, the compressed-row index vector,
   and the device-resident scalar constants (alpha, beta_zero, beta_one), then
   deletes the struct and zeros the caller's pointer. A NULL *matstruct is a
   no-op.

   Fix over the previous version: the PetscErrorCode returned by
   CsrMatrix_Destroy() was silently discarded; it is now checked with CHKERRQ,
   consistent with how the triangular-factor overload above checks the same
   call.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB are both stored behind an opaque cusparse hybrid handle */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta live in device memory (they are passed by pointer to cusparse) */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1931 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - Releases all four triangular-factor
   structs (lower/upper, plain and transpose), the row/column permutation index
   vectors, the work vector, and the cusparse handle, then deletes the struct
   and zeros the caller's pointer. A NULL *trifactors is a no-op.

   Fixes over the previous version:
   - the PetscErrorCode returned by the four MatSeqAIJCUSPARSEMultStruct_Destroy()
     calls was silently discarded; it is now checked with CHKERRQ, matching the
     error-checking convention used everywhere else in this file
   - the intentional assignment inside the if-condition is double-parenthesized
     to silence -Wparentheses and make the intent explicit
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) { /* assignment intended */
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1954 
1955