xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4e4bbfaa3814cc83b5851d85be69070845f5653e) !
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 
/* Option strings for the storage-format enum; the trailing entries follow the
   PETSc PetscOptionsEnum convention (enum type name, prefix, NULL terminator). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic/numeric factorization entry points installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Forward declarations: triangular solves (with and without permutations) and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Forward declarations: destructors for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
41 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's
   cuSPARSE context and installs it on the cuSPARSE handle so subsequent
   cuSPARSE calls for this matrix are queued on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  cusp->stream = stream;
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(status);
  PetscFunctionReturn(0);
}
52 
/*
   MatCUSPARSESetHandle - replaces the matrix's cuSPARSE handle with the one
   supplied by the caller (destroying any handle previously owned) and sets
   the pointer mode to device.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the handle we previously owned before adopting the new one */
    if (cusp->handle) {
      status = cusparseDestroy(cusp->handle);CHKERRCUDA(status);
    }
    cusp->handle = handle;
  }
  status = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(status);
  PetscFunctionReturn(0);
}
68 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle
   stored in the matrix's GPU context, e.g. when the handle is owned elsewhere.

   Robustness fix: guard against a NULL A->spptr (matrix not yet set up or
   already torn down) before dereferencing it.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing to clear */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
77 
/* MatFactorGetSolverType_seqaij_cusparse - reports MATSOLVERCUSPARSE as the
   solver package that produced this factor matrix (composed on factor mats
   as "MatFactorGetSolverType_C"). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
84 
85 /*MC
86   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
87   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
88   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
89   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
90   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
91   algorithms are not recommended. This class does NOT support direct solver operations.
92 
93   Level: beginner
94 
95 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
96 M*/
97 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates the (empty, square n x n)
   factor matrix B for A and installs the symbolic-factorization routines
   appropriate to the requested factor type.  Numeric routines are installed
   later by the symbolic routines themselves.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n; /* factors are square: n rows and n columns */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  /* factortype must be set before MatSetType so the type constructor sees a factor matrix */
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the symbolic factorization allocates the actual storage */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
122 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat().  For a sequential matrix MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both select the single storage format used for SpMV.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations map to the same format slot */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
140 
141 /*@
142    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
143    operation. Only the MatMult operation can use different GPU storage formats
144    for MPIAIJCUSPARSE matrices.
145    Not Collective
146 
147    Input Parameters:
148 +  A - Matrix of type SEQAIJCUSPARSE
149 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
150 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
151 
152    Output Parameter:
153 
154    Level: intermediate
155 
156 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
157 @*/
/* Public entry point; dispatches to the type-specific implementation (if any)
   composed on A as "MatCUSPARSESetFormat_C".  PetscTryMethod makes this a
   no-op for matrix types that do not provide the method. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
167 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* options
   selecting the GPU storage format for SpMV and (for factors) TriSolve.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  /* NOTE(review): when A->factortype != MAT_FACTOR_NONE, A->spptr holds a
     Mat_SeqAIJCUSPARSETriFactors, not a Mat_SeqAIJCUSPARSE, so the
     cusparsestruct->format read below may access the wrong struct -- confirm */
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* SpMV-only format option is meaningful just for unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
193 
/* Symbolic ILU: delegate to the CPU AIJ implementation, then install the
   CUSPARSE numeric-factorization routine on the factor matrix. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
203 
/* Symbolic LU: delegate to the CPU AIJ implementation, then install the
   CUSPARSE numeric-factorization routine on the factor matrix. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
213 
/* Symbolic ICC: delegate to the CPU AIJ implementation, then install the
   CUSPARSE numeric-factorization routine on the factor matrix. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
223 
/* Symbolic Cholesky: delegate to the CPU AIJ implementation, then install the
   CUSPARSE numeric-factorization routine on the factor matrix. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
233 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the unit-diagonal lower
   triangular factor L from the host AIJ factor data, copies it to the GPU as
   a CSR matrix, and runs the cuSPARSE triangular-solve analysis on it.
   Only runs when the GPU copy is missing or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;       /* host-side (pinned) CSR row offsets / column indices for L */
  PetscScalar                       *AALo;              /* host-side (pinned) CSR values for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory for fast transfer) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix; row 0 is just the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* ... then append the unit diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assigning to the thrust device arrays uploads the host CSR data */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* host staging buffers are no longer needed once the device copies exist */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
337 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - extracts the upper triangular
   factor U (non-unit diagonal, stored inverted in the AIJ factor layout)
   from the host data, copies it to the GPU as a CSR matrix, and runs the
   cuSPARSE triangular-solve analysis.  Only runs when the GPU copy is
   missing or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;       /* host-side (pinned) CSR row offsets / column indices for U */
  PetscScalar                       *AAUp;              /* host-side (pinned) CSR values for U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* (adiag[] runs backwards in the AIJ factor layout, so this difference is positive) */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (the AIJ factor stores the inverted diagonal, so re-invert it) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assigning to the thrust device arrays uploads the host CSR data */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* host staging buffers are no longer needed once the device copies exist */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
437 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both ILU triangular
   factors on the GPU, allocates the solve work vector, and uploads the
   row/column permutations when they are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nlogged = 0;

  PetscFunctionBegin;
  /* build, upload and analyze both triangular factors */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* NOTE(review): allocated on every numeric factorization -- possible leak on refactorization; confirm */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;

  /* row permutation, applied before the lower triangular solve */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    nlogged += n;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation, applied after the upper triangular solve */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    nlogged += n;
  }

  /* account for whatever permutation data was actually uploaded */
  if (nlogged) {
    ierr = PetscLogCpuToGpu(nlogged*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
484 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds both triangular factors for
   an ICC/Cholesky factorization on the GPU.  The upper factor U is stored
   directly; the "lower" factor reuses U's sparsity pattern (FILL_MODE_UPPER)
   with rescaled values and CUSPARSE_OPERATION_TRANSPOSE, so the transpose
   solve acts as the lower triangular solve.  Only runs when the GPU copy is
   missing or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;     /* host-side (pinned) CSR structure shared by both factors */
  PetscScalar                       *AAUp;            /* values of the upper factor */
  PetscScalar                       *AALo;            /* values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ and Mat_SeqSBAIJ here;
     presumably the ICC factor uses an SBAIJ-compatible layout -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (v[nz] is the stored diagonal of row i) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries: negated for U, additionally scaled by 1/d for the transposed lower factor */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: assigning to the thrust device arrays uploads the host CSR data */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: same UPPER sparsity as U; with the
         TRANSPOSE solve op below this behaves as the lower triangular factor */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* host staging buffers are no longer needed once the device copies exist */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
631 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC triangular
   factors on the GPU, allocates the solve work vector, and uploads the
   permutation (and its inverse) when the ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* NOTE(review): allocated on every numeric factorization -- possible leak on refactorization; confirm */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* both factors share the stored upper pattern: off-diagonals count twice, diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /* lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip;

    /* upload the permutation and its inverse (rperm for rows, cperm for columns) */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
667 
/*
   Numeric LU factorization: delegates the factorization itself to the CPU
   AIJ kernel, installs the appropriate CUSPARSE solve routines, and stages
   the triangular factors on the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  /* natural ordering on both sides allows the permutation-free solve kernels */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve support */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
696 
/*
   Numeric Cholesky factorization: delegates the factorization to the CPU
   AIJ kernel, installs the appropriate CUSPARSE solve routines, and stages
   the triangular factors on the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering allows the permutation-free solve kernels */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve support */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
725 
/*
   Builds, on the GPU, the transposes (CSC form) of the cached lower and upper
   triangular factors and runs the CUSPARSE solve analysis on each, storing the
   results in loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr so
   that MatSolveTranspose_SeqAIJCUSPARSE* can perform forward/backward
   substitution on A^T.  Callers invoke this lazily, only when the transpose
   factors do not yet exist, so no pre-existing transpose structures are freed
   here.

   Fix vs. previous revision: loTriFactorT/upTriFactorT were initialized from
   the (necessarily NULL) *PtrTranspose fields and immediately overwritten by
   `new` -- dead stores that suggested the transposes might be reused.  They
   are now plain declarations.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;   /* allocated below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;   /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     transposition swaps upper/lower, hence the flipped fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so solves are non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the result so subsequent transpose solves reuse it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     transposition swaps upper/lower, hence the flipped fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so solves are non-transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the result so subsequent transpose solves reuse it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
856 
/*
   Builds the transpose of A on the GPU (in the storage format chosen for
   this matrix) and caches it in cusparsestruct->matTranspose so that
   MatMultTranspose* can run a plain non-transposed SpMV.  Called lazily the
   first time a transpose product is requested.

   Fixes vs. previous revision:
   - rowoffsets_gpu (the temporary expanded row-offset array fed to
     cusparse_csr2csc) was never freed, leaking GPU memory on every call;
   - matstructT was initialized from matTranspose and immediately
     overwritten with `new` -- a dead store, now removed.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;   /* allocated below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose mult structure and its descriptor */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars used with device pointer mode */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; /* GPU --> CPU */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compressed rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU);
       rows absent from cprowIndices are empty and inherit the previous offset */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; /* CPU --> GPU */

    /* compute the transpose, i.e. the CSC of A */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
    delete rowoffsets_gpu; /* temporary expanded row offsets; was leaked previously */

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT dimensions and the csr2hyb row/col counts below use
       rmap/cmap unswapped; this looks suspect for rectangular matrices --
       confirm (harmless for square A) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* the transpose has no empty rows compressed away: use the full 0..ncols-1 range */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* cache the transpose for reuse by later MatMultTranspose calls */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1002 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solves A^T x = b when the factorization used a non-natural ordering:
   b is gathered through the row permutation, the cached transposed factors
   (built lazily on first use) are applied as two triangular solves, and the
   result is gathered through the column permutation.
   NOTE(review): assumes rpermIndices/cpermIndices were populated by the
   analysis routines, i.e. a non-identity ordering -- the NaturalOrdering
   variant is installed otherwise.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (cached for later calls) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transpose of U (A^T = U^T L^T, so U^T comes first): x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the transpose of L: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves over the factored nonzeros; the shared diagonal is counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1072 
/*
   Solves A^T x = b for a factorization done in the natural ordering, so no
   row/column permutations are needed.  The transposed triangular factors are
   built lazily (first call) by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve()
   and cached for subsequent solves.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (cached for later calls) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transpose of U (A^T = U^T L^T): b -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the transpose of L: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves over the factored nonzeros; the shared diagonal is counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1123 
/*
   Solves A x = b when the factorization used a non-natural ordering:
   b is gathered through the row permutation, L then U triangular solves are
   applied (A = LU), and the result is scattered through the column
   permutation.  tempGPU (the shared work vector) is ping-ponged between the
   two solves.
   NOTE(review): assumes rpermIndices/cpermIndices were populated by the
   analysis routines, i.e. a non-identity ordering -- the NaturalOrdering
   variant is installed otherwise.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector
     (iteration length is determined by the index-iterator range) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Then, solve U: x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Last, reorder the work vector with the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves over the factored nonzeros; the shared diagonal is counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1181 
/*
   Solves A x = b for a factorization done in the natural ordering: no
   permutations are applied; L then U triangular solves (A = LU) run on the
   GPU with the shared work vector in between.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves over the factored nonzeros; the shared diagonal is counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1224 
/*
   MatSeqAIJCUSPARSECopyToGPU - Mirrors the host AIJ data of A onto the GPU.

   If the nonzero pattern is unchanged (same nonzerostate) and the storage
   format is plain CSR, only the values array is refreshed.  Otherwise the
   GPU structures are torn down and rebuilt from scratch, forcing compressed
   row storage on the GPU even when the host matrix is not compressed.

   Fixes over the previous revision:
   - the old workVector is deleted before a new one is allocated (it was
     leaked on every pattern rebuild);
   - the PetscErrorCode returned by MatSeqAIJCUSPARSEMultStruct_Destroy is
     now checked instead of silently discarded.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* same pattern, plain CSR: refresh the numerical values only */
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* pattern changed (or first copy): rebuild everything on the GPU */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);CHKERRQ(ierr);
      cusparsestruct->mat = NULL;          /* avoid a dangling pointer if the rebuild throws */
      delete cusparsestruct->workVector;   /* previously leaked on rebuild; delete of NULL is a no-op */
      cusparsestruct->workVector = NULL;
      try {
        /* count rows that actually contain nonzeros */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build row offsets/indices over nonzero rows only */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;
        }

        /* allocate space for the matrix descriptor and the device-side scalars */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* alpha/beta live on the device since the handle uses CUSPARSE_POINTER_MODE_DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* CSR: upload offsets, column indices and values directly */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: stage a temporary CSR matrix on the device, convert, then free it */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          matstruct->mat = hybMat;

          /* the staging CSR copy is no longer needed once converted */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* publish the new multiply structure */
        cusparsestruct->mat = matstruct;

        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1361 
/* Thrust functor: given a zip-iterator tuple (src, dst), accumulates src
   into dst (dst += src).  Used to scatter-add a compressed work vector
   back into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1371 
/* Computes yy = A*xx on the GPU.  Implemented as MatMultAdd with a NULL
   addend: passing yy==NULL to MatMultAdd_SeqAIJCUSPARSE selects the
   plain-multiply path (no accumulation). */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1380 
/* Computes yy = A^T*xx on the GPU using an explicitly stored transpose of A
   (built lazily on first use), so the cuSPARSE SpMV can run in the faster
   NON_TRANSPOSE mode. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* transpose is built lazily; generate it on first transpose multiply */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR SpMV on the stored transpose; beta_zero overwrites yy */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1425 
1426 
/* Computes zz = A*xx + yy on the GPU (or zz = A*xx when yy is NULL; this is
   the MatMult path).  When the GPU matrix is stored in compressed-row form
   with fewer rows than A, the SpMV targets a temporary work vector and the
   result is scatter-added back into zz through cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr == PETSC_TRUE means the work vector covers all rows, i.e. no rows were dropped */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* SpMV output: zz directly when possible, otherwise the compressed work vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one accumulates in place only when zz already holds yy (aliased) */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
	cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* product went to the work vector: seed zz with yy before scatter-add */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* product is already in zz: add yy unless it was accumulated in place */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* MatMult with compressed storage: rows absent from the work vector must read 0 */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1514 
/* Computes zz = A^T*xx + yy on the GPU using the lazily built explicit
   transpose of A, mirroring the dptr/beta logic of MatMultAdd. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*dptr,*beta;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* NOTE(review): this initial copy looks redundant — every exit path below
       either copies yy into zz again or accumulates via beta_one/AXPY.
       Kept as-is; confirm before removing. */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* SpMV output: zz directly when the work vector spans all columns of A, else the work vector */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one accumulates in place only when zz already holds yy (aliased) */
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      /* product went to the work vector: seed zz with yy before scatter-add */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      /* product is already in zz: add yy unless it was accumulated in place */
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1592 
/* Completes assembly of A: runs the host AIJ assembly, then (for a final
   assembly of an unfactored matrix) pushes the data to the GPU and installs
   the cuSPARSE multiply kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* only a final assembly warrants a GPU copy; factored matrices keep their factor data */
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1609 
1610 /* --------------------------------------------------------------------------------*/
1611 /*@
1612    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1614    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1615    assembly performance the user should preallocate the matrix storage by setting
1616    the parameter nz (or the array nnz).  By setting these parameters accurately,
1617    performance during matrix assembly can be increased by more than a factor of 50.
1618 
1619    Collective
1620 
1621    Input Parameters:
1622 +  comm - MPI communicator, set to PETSC_COMM_SELF
1623 .  m - number of rows
1624 .  n - number of columns
1625 .  nz - number of nonzeros per row (same for all rows)
1626 -  nnz - array containing the number of nonzeros in the various rows
1627          (possibly different for each row) or NULL
1628 
1629    Output Parameter:
1630 .  A - the matrix
1631 
1632    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1634    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1635 
1636    Notes:
1637    If nnz is given then nz is ignored
1638 
1639    The AIJ format (also called the Yale sparse matrix format or
1640    compressed row storage), is fully compatible with standard Fortran 77
1641    storage.  That is, the stored row and column indices can begin at
1642    either one (as in Fortran) or zero.  See the users' manual for details.
1643 
1644    Specify the preallocated storage with either nz or nnz (not both).
1645    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1646    allocation.  For large problems you MUST preallocate memory or you
1647    will get TERRIBLE performance, see the users' manual chapter on matrices.
1648 
1649    By default, this format uses inodes (identical nodes) when possible, to
1650    improve numerical efficiency of matrix-vector products and solves. We
1651    search for consecutive rows with the same nonzero structure, thereby
1652    reusing matrix information to achieve increased efficiency.
1653 
1654    Level: intermediate
1655 
1656 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1657 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create an m-by-n sequential matrix of type MATSEQAIJCUSPARSE */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate host storage; nnz (per-row counts) overrides nz when given */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1669 
/* Destroys A: releases the GPU-side data hanging off A->spptr (multiply
   structures for an unfactored matrix, triangular-factor structures for a
   factored one), then runs the host AIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices store triangular-factor data in spptr */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* nothing to free on the GPU if the matrix was never uploaded */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1685 
/* Duplicates A into *B: performs the host AIJ duplication, then attaches a
   fresh (empty) cuSPARSE data structure to the copy — GPU data is NOT
   copied; the duplicate starts PETSC_OFFLOAD_UNALLOCATED and is uploaded
   lazily.  Fix: removed the dead store of 0 into the trifactor handle field
   that was immediately overwritten by the cusparseCreate result. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
    C->spptr           = cusp;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    C->spptr                     = tri;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1744 
/* In-place conversion of a SEQAIJ matrix B to SEQAIJCUSPARSE: switches the
   default vector type to CUDA, attaches an empty cuSPARSE data structure
   (multiply structures, or triangular-factor slots for factored matrices),
   and installs the GPU operation table. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
    B->spptr           = cusp;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    B->spptr                     = tri;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1798 
/* Type constructor for MATSEQAIJCUSPARSE: builds a plain SEQAIJ matrix and
   then converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1808 
1809 /*MC
1810    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1811 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1813    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1814    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1815 
1816    Options Database Keys:
1817 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1818 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1819 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1820 
1821   Level: beginner
1822 
1823 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1824 M*/
1825 
1826 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1827 
1828 
/* Registers the CUSPARSE solver package for all four factorization kinds
   (LU, Cholesky, ILU, ICC) on SEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  /* every factor kind maps to the same MatGetFactor implementation */
  for (i = 0; i < sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1840 
1841 
/* Frees the full GPU data structure of an unfactored matrix: both multiply
   structures, the work vector, and the cuSPARSE handle; zeroes the caller's
   pointer.  Fixes: the assignment-inside-if (`if (handle = ...)`) is split
   into assignment then test, and the PetscErrorCode results of the
   MultStruct destroy calls are now checked. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;  /* delete of NULL is a no-op */
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1860 
/* Frees the three thrust device arrays of a CsrMatrix and the struct itself,
   then zeroes the caller's pointer.  Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1873 
/*
  Destroys one triangular-factor structure: its cusparse matrix descriptor,
  its solve-analysis info, and its CSR storage, then deletes the struct and
  zeros the caller's pointer. A NULL *trifactor is a no-op.

  Note: overloads the name used for Mat_SeqAIJCUSPARSEMultStruct below;
  overload resolution picks this version for trifactor pointers.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *fs = *trifactor;
    if (fs->descr) {
      stat = cusparseDestroyMatDescr(fs->descr);CHKERRCUDA(stat);
    }
    if (fs->solveInfo) {
      stat = cusparseDestroySolveAnalysisInfo(fs->solveInfo);CHKERRCUDA(stat);
    }
    ierr = CsrMatrix_Destroy(&fs->csrMat);CHKERRQ(ierr);
    delete fs;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1889 
/*
  Destroys a SpMV structure. The payload in (*matstruct)->mat is either a
  cusparseHybMat_t (for the ELL and HYB storage formats) or a CsrMatrix*
  (CSR), selected by the format argument. Also frees the matrix descriptor,
  the compressed-row index container, and the device-resident scalar
  constants (alpha, beta_zero, beta_one), then deletes the struct and zeros
  the caller's pointer. A NULL *matstruct is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* Check the error return (it was previously dropped), matching the
           checked CsrMatrix_Destroy() call in the trifactor destroy routine. */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* The scalar constants live in device memory (see their cudaFree), so release with cudaFree. */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1917 
/*
  Destroys the complete triangular-factor bundle used by the cusparse solve
  paths: the lower/upper factors and their transposes, the row/column
  permutation index containers, the work vector, and the cusparse handle,
  then deletes the struct and zeros the caller's pointer. Safe to call with
  *trifactors == NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* Check the error returns of the destroy helpers (they were previously dropped). */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* Explicit comparison instead of assignment-in-condition (avoids -Wparentheses hazard). */
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1940 
1941