xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 81902715bc94d6038a0c0df874c84e105aea95e4)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* printable names for MatCUSPARSEStorageFormat; trailing entries are the enum-type name, option prefix, and terminator required by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
/* symbolic/numeric factorization entry points installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* GPU triangular solves, matrix-vector products, and options handling */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* destructors for the GPU-side data structures (the two MultStruct_Destroy lines are C++ overloads) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
42 
/*
   MatCUSPARSESetStream - records a CUDA stream on the matrix and attaches it to
   the matrix's cuSPARSE handle so subsequent cuSPARSE calls for this matrix are
   queued on that stream.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  cstat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(cstat);
  PetscFunctionReturn(0);
}
53 
/*
   MatCUSPARSESetHandle - installs a caller-provided cuSPARSE handle on the
   matrix, destroying any different handle currently held, and switches the
   handle to device pointer mode.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE
-  handle - the cuSPARSE handle to adopt
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    /* release the handle we currently hold before adopting the new one */
    if (cusparsestruct->handle) {
      cstat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(cstat);
    }
    cusparsestruct->handle = handle;
  }
  /* scalar parameters (alpha/beta) will be passed as device pointers */
  cstat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(cstat);
  PetscFunctionReturn(0);
}
69 
/*
   MatCUSPARSEClearHandle - forgets the matrix's cuSPARSE handle reference
   without destroying it (ownership is assumed to lie elsewhere).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
78 
/* Reports the solver package name (MATSOLVERCUSPARSE) for factors produced by this package */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
85 
86 /*MC
87   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU, of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
90   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
91   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
92   algorithms are not recommended. This class does NOT support direct solver operations.
93 
94   Level: beginner
95 
96 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
97 M*/
98 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an empty factor matrix B for A and
   wires in the CUSPARSE symbolic factorization routines appropriate for ftype.
   Registered as the "cusparse" solver package for SEQAIJCUSPARSE matrices.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    /* only symbolic ops are installed here; the symbolic routines install the numeric ops */
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: storage for the factor is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
123 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(); a sequential matrix carries a single storage format,
   so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same field.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations map onto the single format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
141 
142 /*@
143    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144    operation. Only the MatMult operation can use different GPU storage formats
145    for MPIAIJCUSPARSE matrices.
146    Not Collective
147 
148    Input Parameters:
149 +  A - Matrix of type SEQAIJCUSPARSE
150 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
152 
153    Output Parameter:
154 
155    Level: intermediate
156 
157 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; silently a no-op for types that do not compose it */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* options that
   select the GPU storage format for SpMV and/or triangular solves.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* the SpMV-only format option is offered only for unfactored matrices */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
194 
/* ILU symbolic phase runs on the CPU; only the numeric op is redirected to the GPU version */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(perr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
/* LU symbolic phase runs on the CPU; only the numeric op is redirected to the GPU version */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(perr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
/* ICC symbolic phase runs on the CPU; the numeric op is redirected to the GPU version */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
/* Cholesky symbolic phase runs on the CPU; the numeric op is redirected to the GPU version */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the unit-diagonal lower
   triangular factor L in CSR form from the host-side factored matrix A, uploads
   it to the GPU (thrust arrays), and runs the cuSPARSE triangular-solve
   analysis.  The result is stored in cusparseTriFactors->loTriFactorPtr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the CPU copy is newer than (or there is no) GPU copy */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 of L holds only its unit diagonal, hence ai[n]-ai[1] strict-lower entries plus n diagonal ones */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host buffers so the thrust assign() upload below is fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;   /* next free slot in AjLo/AALo (slot 0 already holds row 0's diagonal) */
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strict lower part of row i, then append the explicit unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      /* diagonal is all 1's (stored explicitly above, but flagged UNIT for the solver) */
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* assign() copies from the pinned host buffers into device memory */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      /* log the host-to-device traffic generated by the three assign() calls above */
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
338 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - extracts the upper triangular
   factor U in CSR form from the host-side factored matrix A, uploads it to the
   GPU, and runs the cuSPARSE triangular-solve analysis.  The result is stored
   in cusparseTriFactors->upTriFactorPtr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the CPU copy is newer than (or there is no) GPU copy */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases with row index here, so this difference is the U entry count */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers so the thrust assign() upload below is fast */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        /* v[nz] is the stored reciprocal of U's diagonal; invert it back to the true
           diagonal value since the descriptor below is DIAG_TYPE_NON_UNIT
           (NOTE(review): relies on the SeqAIJ numeric factorization storing 1/diag — confirm) */
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* assign() copies from the pinned host buffers into device memory */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      /* log the host-to-device traffic generated by the three assign() calls above */
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
438 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - uploads both ILU triangular
   factors to the GPU, allocates the solve work vector, and uploads the row and
   column permutations when they are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation (applied before the lower solve); uploaded only when non-trivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    nperm += n;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation (applied after the upper solve); uploaded only when non-trivial */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    nperm += n;
  }

  /* log exactly the number of permutation entries shipped to the device */
  if (nperm) {ierr = PetscLogCpuToGpu(nperm*sizeof(PetscInt));CHKERRQ(ierr);}

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
485 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds both triangular factors for an
   ICC factorization from the host data.  Only the upper-triangular pattern is
   stored; the "lower" solve is realized as a TRANSPOSE solve on a second copy
   of that upper pattern with rescaled values (AALo).  Results go into
   cusparseTriFactors->{up,lo}TriFactorPtr.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is reinterpreted as SBAIJ here although it was cast to
     Mat_SeqAIJ above; both layouts are accessed only through i/j/a — confirm intent */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the CPU copy is newer than (or there is no) GPU copy */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers so the thrust assign() uploads below are fast */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the diagonal entry of row i (NOTE(review): assumes the diagonal
           is stored last in each row of this factored format — confirm) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: negated for the upper solve, additionally scaled by
             1/diag for the transposed (lower) solve */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor reuses the UPPER pattern and is solved via TRANSPOSE below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      /* factor of 2: both the upper and the lower copies were uploaded */
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
632 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - uploads the ICC factors to the
   GPU, allocates the solve work vector, and uploads the permutation and its
   inverse when the ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both triangles are stored but the diagonal is shared, hence 2*(nz-n)+n */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* symmetric factorization uses one ordering: rperm is ip, cperm its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
668 
/*
   Numeric LU factorization: the factorization itself runs on the CPU
   (MatLUFactorNumeric_SeqAIJ); this wrapper then selects the appropriate
   GPU solve kernels and copies the factors to the device.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* With identity row AND column permutations the permutation-free
     (NaturalOrdering) solve kernels can be used */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
697 
/*
   Numeric Cholesky factorization: the factorization itself runs on the CPU
   (MatCholeskyFactorNumeric_SeqAIJ); this wrapper then selects the
   appropriate GPU solve kernels and copies the factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* An identity permutation allows the permutation-free (NaturalOrdering)
     solve kernels to be used */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  B->ops->solve             = perm_identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = perm_identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
726 
/*
   Helper: builds the explicit transpose (the CSC representation) of one
   triangular factor on the GPU and performs the cusparse solve analysis on
   it.  The input factor is not modified; the newly allocated transposed
   factor structure is returned in *triFactorT.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactorBuildTranspose(cusparseHandle_t handle,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorT)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *T;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  T = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* transposition flips the fill mode; matrix type, index base and diagonal
     type carry over from the input factor unchanged */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&T->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(T->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(T->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(T->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(T->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&T->solveInfo);CHKERRCUDA(stat);

  /* the transpose is stored explicitly, so the solve itself is non-transposed */
  T->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the factor */
  T->csrMat                 = new CsrMatrix;
  T->csrMat->num_rows       = triFactor->csrMat->num_rows;
  T->csrMat->num_cols       = triFactor->csrMat->num_cols;
  T->csrMat->num_entries    = triFactor->csrMat->num_entries;
  T->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactor->csrMat->num_rows+1);
  T->csrMat->column_indices = new THRUSTINTARRAY32(triFactor->csrMat->num_entries);
  T->csrMat->values         = new THRUSTARRAY(triFactor->csrMat->num_entries);

  /* compute the transpose of the factor, i.e. the CSC */
  stat = cusparse_csr2csc(handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          T->csrMat->values->data().get(),
                          T->csrMat->column_indices->data().get(),
                          T->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(handle, T->solveOp,
                           T->csrMat->num_rows, T->csrMat->num_entries,
                           T->descr, T->csrMat->values->data().get(),
                           T->csrMat->row_offsets->data().get(), T->csrMat->column_indices->data().get(),
                           T->solveInfo);CHKERRCUDA(stat);

  *triFactorT = T;
  PetscFunctionReturn(0);
}

/*
   Builds the explicit transposes of both triangular factors and runs the
   cusparse solve analysis on them, storing the results in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose.  Called lazily on the
   first transpose solve.  The previous code duplicated the entire build
   sequence for the lower and upper factors (and dead-initialized the
   local transpose pointers before unconditionally overwriting them); the
   shared logic now lives in MatSeqAIJCUSPARSETriFactorBuildTranspose().
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT,*upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* the transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETriFactorBuildTranspose(cusparseTriFactors->handle,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* the transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETriFactorBuildTranspose(cusparseTriFactors->handle,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
857 
/*
   Builds the GPU SpMV data structure for the transpose of A so that
   MatMultTranspose can be executed as a non-transposed SpMV on the
   explicitly stored transpose.  The result is stored in
   cusparsestruct->matTranspose.  (Previously matstructT was dead-initialized
   from the struct and then immediately overwritten, and indexBase was
   redundantly re-read in the CSR branch; both removed.)
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the SpMV information of the transpose; the index base
     is inherited from the untransposed matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars for cusparse, which is put
     into device pointer mode below */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    /* the transpose has cmap rows and rmap columns */
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* full (non-compressed) row offsets of A, uploaded from the host; also
       reused later when only the values need refreshing */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
    /* compute the transpose, i.e. the CSC */
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT keeps rmap x cmap dimensions even though it holds
       the transpose; this is only consistent for square matrices -- confirm */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: identity sequence of length cmap
     (every row of the transpose treated as nonzero) */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
986 
/* Solves A^T x = b (with row/column permutations) using explicitly stored
   transposes of the triangular factors, built lazily on first use.
   NOTE(review): it may be possible to avoid building/analyzing the
   transposes by passing CUSPARSE_OPERATION_TRANSPOSE to cusparse_solve on
   the untransposed factors instead -- confirm. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (for A^T the upper factor is applied before the lower): x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation
     (temp[i] = x[cperm[i]]) ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and record timing/flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1056 
/*
   Solves A^T x = b for a factorization in natural ordering (no row/column
   permutations).  Uses explicitly stored transposes of the triangular
   factors, built lazily on the first call.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  cusparseStatus_t                  status;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loFactorT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upFactorT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *workGPU    = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build and analyze the transposed factors on first use */
  if (!loFactorT && !upFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
    upFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b_d);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* For A^T the upper factor is applied first: b -> work */
  status = cusparse_solve(triFactors->handle, upFactorT->solveOp,
                          upFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upFactorT->descr,
                          upFactorT->csrMat->values->data().get(),
                          upFactorT->csrMat->row_offsets->data().get(),
                          upFactorT->csrMat->column_indices->data().get(),
                          upFactorT->solveInfo,
                          b_d, workGPU->data().get());CHKERRCUDA(status);

  /* ... then the lower factor: work -> x */
  status = cusparse_solve(triFactors->handle, loFactorT->solveOp,
                          loFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loFactorT->descr,
                          loFactorT->csrMat->values->data().get(),
                          loFactorT->csrMat->row_offsets->data().get(),
                          loFactorT->csrMat->column_indices->data().get(),
                          loFactorT->solveInfo,
                          workGPU->data().get(), x_d);CHKERRCUDA(status);

  /* restore the vector arrays and record timing/flops */
  ierr = VecCUDARestoreArrayRead(bb,&b_d);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1107 
/* Solves A x = b with row/column permutations: gathers b through the row
   permutation, does the L then U triangular solves on the GPU, and gathers
   the result through the column permutation. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: temp[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Then, solve U: x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Last, reorder with the column permutation: x[i] = temp[cperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  /* restore the vector arrays and record timing/flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1165 
/*
   Solves A x = b for a factorization in natural ordering: no permutation
   gathers are needed, just the forward (L) and backward (U) triangular
   solves on the GPU.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  cusparseStatus_t                  status;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loFactor   = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upFactor   = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                       *workGPU    = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b_d);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward solve with L: b -> work */
  status = cusparse_solve(triFactors->handle, loFactor->solveOp,
                          loFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loFactor->descr,
                          loFactor->csrMat->values->data().get(),
                          loFactor->csrMat->row_offsets->data().get(),
                          loFactor->csrMat->column_indices->data().get(),
                          loFactor->solveInfo,
                          b_d, workGPU->data().get());CHKERRCUDA(status);

  /* backward solve with U: work -> x */
  status = cusparse_solve(triFactors->handle, upFactor->solveOp,
                          upFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upFactor->descr,
                          upFactor->csrMat->values->data().get(),
                          upFactor->csrMat->row_offsets->data().get(),
                          upFactor->csrMat->column_indices->data().get(),
                          upFactor->solveInfo,
                          workGPU->data().get(), x_d);CHKERRCUDA(status);

  /* restore the vector arrays and record timing/flops */
  ierr = VecCUDARestoreArrayRead(bb,&b_d);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1208 
/*
   MatSeqAIJCUSPARSECopyToGPU - Mirrors the host (Mat_SeqAIJ) data onto the GPU.

   Behavior visible in this routine:
   - No-op unless the host copy is newer (offloadmask UNALLOCATED or CPU).
   - Fast path: if the nonzero pattern is unchanged (same nonzerostate), the matrix was
     assembled before, and the format is CSR, only the numerical values are re-copied;
     an existing transpose is refreshed numerically with csr2csc.
   - Slow path: destroys and rebuilds the GPU structures from scratch; the matrix is
     always stored on the GPU in a compressed-row form (rows with no nonzeros removed),
     either taken from a->compressedrow or built here on the fly.
   On success the offload mask is set to PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Nonzero pattern unchanged: copy the values only, reuse index arrays already on the GPU */
      CsrMatrix *mat,*matT;
      mat  = (CsrMatrix*)cusparsestruct->mat->mat;
      mat->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Update matT when it was built before; CUSPARSE_ACTION_NUMERIC recomputes only the values */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                 A->cmap->n, mat->num_entries,
                                 mat->values->data().get(),
                                 cusparsestruct->rowoffsets_gpu->data().get(),
                                 mat->column_indices->data().get(),
                                 matT->values->data().get(),
                                 matT->column_indices->data().get(),
                                 matT->row_offsets->data().get(),
                                 CUSPARSE_ACTION_NUMERIC,indexBase);CHKERRCUDA(stat);
      }
    } else {
      /* Pattern changed (or first assembly): throw away the old GPU structures and rebuild */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        /* Count rows that have at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* Host already stores a compressed-row view: reuse its arrays directly */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build ii (row offsets) and ridx
             (original row index of each nonempty row) from the full CSR here */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate and initialize the multiply structure (descriptor plus device-resident
           alpha/beta scalars; pointer mode is DEVICE so spmv reads them from GPU memory) */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix: upload compressed row offsets, column indices, and values */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: build a temporary device CSR, convert it with csr2hyb, then free the CSR */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL uses a fixed-width partition, HYB lets cuSPARSE choose the split */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once the HYB matrix exists */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices (original row number of each stored row) */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were PetscMalloc'd above only in the !compressedrow.use branch */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector used by MatMultAdd to hold the compressed-row product */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1364 
/* Thrust functor applied to a zip-iterator tuple (src, dst): accumulates the
   first element into the second, i.e. dst += src. Used to scatter-add the
   compressed-row product held in workVector into the full output vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1374 
/* MatMult for SeqAIJCUSPARSE: yy = A*xx, implemented as MatMultAdd with a NULL
   addend so the zero-initialization/compressed-row logic lives in one place. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1383 
/* MatMultTranspose for SeqAIJCUSPARSE: yy = A^T * xx.
   The explicit transpose (matTranspose) is built lazily on first use and then
   multiplied with CUSPARSE_OPERATION_NON_TRANSPOSE (cuSPARSE's transposed spmv
   path is much slower than spmv on an explicitly stored transpose). */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the explicit transpose on the GPU */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (yy->map->n) {
    PetscInt                     n = yy->map->n;
    cudaError_t                  err;
    err = cudaMemset(yarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err); /* hack to fix numerical errors from reading output vector yy, apparently */
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* y = alpha*A^T*x + beta_zero*y, with alpha/beta living in device memory (pointer mode DEVICE) */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1433 
1434 
/* MatMultAdd for SeqAIJCUSPARSE: zz = A*xx + yy (yy may be NULL, giving zz = A*xx).
   The GPU matrix is stored in compressed-row form; when rows were actually removed
   (workVector is shorter than A->rmap->n) the spmv result lands in workVector and is
   then scatter-added into zz through cprowIndices. When no rows were removed (cmpr)
   the spmv writes straight into zz's array. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr is TRUE when the compressed storage covers every row, i.e. nothing was squeezed out */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* dptr: where the spmv writes; beta_one only when accumulating in place into zz (yy == zz) */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* establish the correct base value in zz before the scatter-add:
       yy when the product sits in workVector, otherwise zz += yy (unless in-place) */
    if (yy) {
      if (dptr != zarray) {
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1518 
/* MatMultTransposeAdd for SeqAIJCUSPARSE: zz = A^T*xx + yy.
   Uses the lazily built explicit transpose (matTranspose) and the same
   workVector/scatter scheme as MatMultAdd_SeqAIJCUSPARSE.

   NOTE(review): the scatter below bounds the zip iterator with A->cmap->n while the
   analogous code in MatMultAdd_SeqAIJCUSPARSE uses workVector->size(); when that branch
   is taken, workVector->size() != A->cmap->n by construction of dptr, so the iteration
   appears to run past the end of workVector — confirm against upstream history.
   NOTE(review): VecCopy_SeqCUDA(yy,zz) is done unconditionally at the top and then
   again in the dptr != zarray branch; the second copy looks redundant — verify. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*dptr,*beta;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build the explicit transpose on the GPU */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* write the product directly into zz only when workVector spans all A->cmap->n rows
       of the transpose; otherwise stage it in workVector and scatter-add afterwards */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* establish the base value of zz before the scatter-add (see MatMultAdd) */
    if (dptr != zarray) {
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1596 
/* Assembly hook: finish the CPU-side SeqAIJ assembly, then (for a final assembly of
   an unfactored matrix) push the data to the GPU and install the CUSPARSE mult ops.
   Flush assemblies leave the GPU copy and the function table untouched. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1613 
1614 /* --------------------------------------------------------------------------------*/
1615 /*@
1616    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1617    (the default parallel PETSc format). This matrix will ultimately be pushed down
1618    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1619    assembly performance the user should preallocate the matrix storage by setting
1620    the parameter nz (or the array nnz).  By setting these parameters accurately,
1621    performance during matrix assembly can be increased by more than a factor of 50.
1622 
1623    Collective
1624 
1625    Input Parameters:
1626 +  comm - MPI communicator, set to PETSC_COMM_SELF
1627 .  m - number of rows
1628 .  n - number of columns
1629 .  nz - number of nonzeros per row (same for all rows)
1630 -  nnz - array containing the number of nonzeros in the various rows
1631          (possibly different for each row) or NULL
1632 
1633    Output Parameter:
1634 .  A - the matrix
1635 
1636    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1637    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1638    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1639 
1640    Notes:
1641    If nnz is given then nz is ignored
1642 
1643    The AIJ format (also called the Yale sparse matrix format or
1644    compressed row storage), is fully compatible with standard Fortran 77
1645    storage.  That is, the stored row and column indices can begin at
1646    either one (as in Fortran) or zero.  See the users' manual for details.
1647 
1648    Specify the preallocated storage with either nz or nnz (not both).
1649    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1650    allocation.  For large problems you MUST preallocate memory or you
1651    will get TERRIBLE performance, see the users' manual chapter on matrices.
1652 
1653    By default, this format uses inodes (identical nodes) when possible, to
1654    improve numerical efficiency of matrix-vector products and solves. We
1655    search for consecutive rows with the same nonzero structure, thereby
1656    reusing matrix information to achieve increased efficiency.
1657 
1658    Level: intermediate
1659 
1660 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1661 @*/
/* See the manual page above: convenience constructor that creates, types, and
   preallocates a MATSEQAIJCUSPARSE matrix in one call. */
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast away const: the preallocation routine does not modify nnz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1673 
/* Destructor: frees the GPU-side structures stored in A->spptr (either the
   mult structures or the triangular-factor structures, depending on whether
   the matrix is a factor), then delegates to the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* nothing was ever uploaded if the offload mask is still UNALLOCATED */
    if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1689 
/* Duplicate: clone the host SeqAIJ matrix, then attach a fresh (empty) set of
   CUSPARSE structures to the copy and install the CUSPARSE function table.
   The GPU data itself is NOT copied; the duplicate starts OFFLOAD_UNALLOCATED
   and is uploaded on first use. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    C->spptr              = spptr;
    spptr->mat            = 0;
    spptr->matTranspose   = 0;
    spptr->workVector     = 0;
    spptr->rowoffsets_gpu = 0;
    spptr->format         = MAT_CUSPARSE_CSR;
    spptr->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle         = handle;
    spptr->nonzerostate   = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tf = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr                     = tf;
    tf->loTriFactorPtr           = 0;
    tf->upTriFactorPtr           = 0;
    tf->loTriFactorPtrTranspose  = 0;
    tf->upTriFactorPtrTranspose  = 0;
    tf->rpermIndices             = 0;
    tf->cpermIndices             = 0;
    tf->workVector               = 0;
    tf->handle                   = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tf->handle                   = handle;
    tf->nnz                      = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1749 
/* In-place conversion of a SeqAIJ matrix to SeqAIJCUSPARSE: switches the default
   vector type to CUDA, attaches fresh (empty) CUSPARSE structures, and installs
   the CUSPARSE function table. No data is uploaded here; the matrix starts in
   the OFFLOAD_UNALLOCATED state and is copied to the GPU on first use. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    B->spptr              = spptr;
    spptr->mat            = 0;
    spptr->matTranspose   = 0;
    spptr->workVector     = 0;
    spptr->rowoffsets_gpu = 0;
    spptr->format         = MAT_CUSPARSE_CSR;
    spptr->stream         = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle         = handle;
    spptr->nonzerostate   = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tf = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                     = tf;
    tf->loTriFactorPtr           = 0;
    tf->upTriFactorPtr           = 0;
    tf->loTriFactorPtrTranspose  = 0;
    tf->upTriFactorPtrTranspose  = 0;
    tf->rpermIndices             = 0;
    tf->cpermIndices             = 0;
    tf->workVector               = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tf->handle                   = handle;
    tf->nnz                      = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1804 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a plain SeqAIJ
   matrix, then convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1814 
1815 /*MC
1816    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1817 
1818    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1819    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1820    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1821 
1822    Options Database Keys:
1823 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1824 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1825 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1826 
1827   Level: beginner
1828 
1829 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1830 M*/
1831 
1832 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1833 
1834 
/* Registers the CUSPARSE solver package for SeqAIJCUSPARSE matrices: the same
   factory routine serves LU, Cholesky, ILU and ICC factorizations. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1846 
1847 
/* Frees every GPU structure owned by a Mat_SeqAIJCUSPARSE (mult struct, transpose
   mult struct, work vector, row offsets, cuSPARSE handle) and zeroes the pointer.
   Safe to call with *cusparsestruct == NULL.

   Fixes vs. previous version: the PetscErrorCode returned by the two
   MatSeqAIJCUSPARSEMultStruct_Destroy calls is now checked (CHKERRQ, per the
   convention used everywhere else in this file), and the assignment-inside-if
   (`if (handle = ...)`) is replaced by a plain test of the stored handle. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1867 
/* Release the three device-side arrays of a CsrMatrix, then the struct itself,
   and zero the caller's pointer. A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1880 
/* Tear down one triangular-factor structure: its cuSPARSE matrix descriptor,
   solve-analysis info, and CSR storage; finally free the struct and zero the
   caller's pointer. A NULL *trifactor is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (!*trifactor) PetscFunctionReturn(0);
  if ((*trifactor)->descr) {
    stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat);
  }
  if ((*trifactor)->solveInfo) {
    stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat);
  }
  ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
  delete *trifactor;
  *trifactor = 0;
  PetscFunctionReturn(0);
}
1896 
/* Tear down a mult structure. The opaque mat member is either a cusparseHybMat_t
   (ELL/HYB formats) or a CsrMatrix* (CSR), chosen by the format argument; the
   scalar constants (alpha, beta_zero, beta_one) live in device memory and are
   released with cudaFree. Zeroes the caller's pointer; NULL *matstruct is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        /* propagate destroy failures; previously the PetscErrorCode was dropped */
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1924 
/* Free the full set of triangular factors (lower/upper and their transposes),
   the permutation index vectors, the work vector, and the cuSPARSE handle,
   then the container itself. Zeroes the caller's pointer; NULL input is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* These destroy routines return PetscErrorCode; check each one instead of
       silently discarding failures */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* The handle may never have been created; only destroy a non-NULL one */
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1947 
1948