xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 3ca0882ebe6e6bc3c96f853b3244799ed8a662a6)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 
/* Human-readable names for MatCUSPARSEStorageFormat, used by PetscOptionsEnum;
   the trailing entries follow the PETSc enum-string convention (type name, prefix, NULL). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic/numeric factorization hooks installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solve and matrix-vector product implementations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for the GPU-side data structures hung off Mat->spptr */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
41 
/* MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU data
   and makes the matrix's cuSPARSE handle issue its work on that stream.
   A must already be a SEQAIJCUSPARSE matrix (its spptr holds a Mat_SeqAIJCUSPARSE). */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cerr;

  PetscFunctionBegin;
  cusp->stream = stream;
  cerr = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
52 
/* MatCUSPARSESetHandle - attaches an externally created cuSPARSE handle to the
   matrix, destroying any different handle the matrix previously held, and
   switches the handle to device pointer mode (scalar arguments such as
   alpha/beta are read from device memory). */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cerr;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* release the previous handle before adopting the new one */
    if (cusp->handle) {cerr = cusparseDestroy(cusp->handle);CHKERRCUDA(cerr);}
    cusp->handle = handle;
  }
  cerr = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
68 
/* MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle
   stored in the matrix's GPU data. Used when the handle is owned elsewhere and
   must not be freed by this matrix's destructor.

   Fix: guard against A->spptr being NULL — this routine can be reached on a
   matrix whose GPU data structure was never allocated (or already destroyed),
   and the original code dereferenced the pointer unconditionally. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
77 
/* Query hook composed on factor matrices (under "MatFactorGetSolverType_C"):
   reports that this factor was produced by the "cusparse" solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
84 
85 /*MC
86   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
87   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
89   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
90   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
91   algorithms are not recommended. This class does NOT support direct solver operations.
92 
93   Level: beginner
94 
95 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
96 M*/
97 
/* Factory routine registered with the MATSOLVERCUSPARSE package: builds an empty
   square SEQAIJCUSPARSE factor matrix B for A and installs the symbolic
   factorization hooks appropriate for the requested factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  /* the factor is sequential, square, and of the same GPU matrix type as A */
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* storage is allocated later, during the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
122 
/* Type-specific implementation behind MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
   A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both simply overwrite it; anything else is rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
140 
141 /*@
142    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
143    operation. Only the MatMult operation can use different GPU storage formats
144    for MPIAIJCUSPARSE matrices.
145    Not Collective
146 
147    Input Parameters:
148 +  A - Matrix of type SEQAIJCUSPARSE
149 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
150 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
151 
152    Output Parameter:
153 
154    Level: intermediate
155 
156 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
157 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod is a no-op on matrix types that do not compose
     "MatCUSPARSESetFormat_C", so calling this on a non-CUSPARSE Mat is harmless */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
167 
/* Processes the -mat_cusparse_* options database entries for a SEQAIJCUSPARSE
   matrix. The SpMV-only format option is offered only for unfactored matrices;
   the combined SpMV/TriSolve option is always offered. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);}
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
  if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);}
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
193 
/* Symbolic ILU for SEQAIJCUSPARSE: delegate to the CPU AIJ implementation, then
   redirect the numeric phase to the CUSPARSE version. The op is overwritten only
   after the symbolic phase has succeeded. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
203 
/* Symbolic LU for SEQAIJCUSPARSE: delegate to the CPU AIJ implementation, then
   redirect the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
213 
/* Symbolic ICC for SEQAIJCUSPARSE: delegate to the CPU AIJ implementation, then
   redirect the numeric phase to the CUSPARSE Cholesky version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
223 
/* Symbolic Cholesky for SEQAIJCUSPARSE: delegate to the CPU AIJ implementation,
   then redirect the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
233 
/* Builds the GPU copy of the unit-diagonal lower triangular ILU factor L from the
   factored CPU matrix in A->data and runs the cuSPARSE solve analysis on it.
   The CPU factor stores L's strictly-lower entries in rows 1..n-1 of (a->i,a->j,a->a);
   the unit diagonal is added explicitly here. Host staging buffers are pinned
   (cudaMallocHost) to speed the host-to-device copies done by the thrust assigns. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;          /* staging CSR row offsets / column indices */
  PetscScalar                       *AALo;                 /* staging CSR values */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* rebuild only when the GPU copy is missing or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 holds only the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assigns below copy the staged CSR to the device */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the staged host copies are no longer needed once the device arrays exist */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): only catches thrown char*; thrust/cusparse C++ exceptions
         derived from std::exception would propagate — confirm intended */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
337 
/* Builds the GPU copy of the upper triangular ILU factor U from the factored CPU
   matrix in A->data and runs the cuSPARSE solve analysis on it. The CPU factor
   stores U backwards via a->diag: for row i, a->diag[i] points at the (inverted)
   diagonal entry and the strictly-upper entries sit between adiag[i+1]+1 and
   adiag[i]. The diagonal stored on the GPU is 1/d so the solve can multiply. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;          /* staging CSR row offsets / column indices */
  PetscScalar                       *AAUp;                 /* staging CSR values */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* rebuild only when the GPU copy is missing or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking the rows from last to first
         since adiag decreases with increasing row index */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements; v[nz] is the stored diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: the thrust assigns below copy the staged CSR to the device */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the staged host copies are no longer needed once the device arrays exist */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): only catches thrown char*; std::exception would propagate */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
437 
/* Uploads both ILU triangular factors to the GPU, allocates the intermediate
   work vector used between the two triangular solves, and uploads the row/column
   permutations when the orderings are not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n,nperm = 0;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  fs->workVector = new THRUSTARRAY(n);
  fs->nnz        = a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* row permutation (applied before the lower-triangular solve) */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    fs->rpermIndices = new THRUSTINTARRAY(n);
    fs->rpermIndices->assign(r, r+n);
    nperm += n;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* column permutation (applied after the upper-triangular solve) */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    fs->cpermIndices = new THRUSTINTARRAY(n);
    fs->cpermIndices->assign(c, c+n);
    nperm += n;
  }

  /* log exactly the permutation data actually uploaded */
  if (nperm) {ierr = PetscLogCpuToGpu(nperm*sizeof(PetscInt));CHKERRQ(ierr);}

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
484 
/* Builds both triangular factors of an ICC/Cholesky factorization on the GPU.
   The CPU factor stores only the upper triangle; the "lower" factor is realized
   as the SAME upper-triangular CSR solved with CUSPARSE_OPERATION_TRANSPOSE
   (hence its descriptor also says FILL_MODE_UPPER), with values rescaled by the
   diagonal. Host staging buffers are pinned for fast upload. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;          /* staging CSR row offsets / column indices */
  PetscScalar                       *AAUp;                 /* staging values for the upper factor */
  PetscScalar                       *AALo;                 /* staging values for the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both as Mat_SeqAIJ and Mat_SeqSBAIJ here;
     the i/j/a members are read through the SBAIJ view — confirm layouts agree */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* rebuild only when the GPU copy is missing or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; v[nz] is the stored diagonal of row i */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* off-diagonals are negated; the lower copy is additionally scaled by 1/diag */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: still FILL_MODE_UPPER because the lower
         solve is performed as a transposed solve of this upper-stored matrix */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as the upper factor, rescaled values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      /* the staged host copies are no longer needed once the device arrays exist */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): only catches thrown char*; std::exception would propagate */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
631 
/* Uploads the ICC/Cholesky triangular factors to the GPU, allocates the work
   vector used between the two solves, and — for non-natural orderings — uploads
   the permutation, which is applied identically on both sides. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip  = a->row;
  const PetscInt               *perm;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  fs->workVector = new THRUSTARRAY(n);
  /* strictly-triangular entries appear in both factors; the diagonal only once */
  fs->nnz = (a->nz-n)*2 + n;

  ierr = ISGetIndices(ip,&perm);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric factorization: the row and column permutations coincide */
    fs->rpermIndices = new THRUSTINTARRAY(n);
    fs->rpermIndices->assign(perm, perm+n);
    fs->cpermIndices = new THRUSTINTARRAY(n);
    fs->cpermIndices->assign(perm, perm+n);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&perm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
660 
/* Numeric LU factorization: the factorization itself runs on the CPU; afterwards
   the solve ops are selected (permuting vs. natural-ordering variants) and the
   triangular factors are analyzed and copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural-ordering solves can skip the permutation kernels entirely */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!row_identity || !col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
685 
/*
   Numeric Cholesky factorization: runs on the CPU
   (MatCholeskyFactorNumeric_SeqAIJ), then copies the triangular factors
   to the GPU and selects the GPU solve kernels.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* perform the numeric factorization on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* The natural-ordering solve kernels skip the permutation steps, so
     they may be used only when the factorization ordering is the identity. */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  if (!identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
710 
/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds the transposes (CSC
   form) of the cached lower and upper triangular factors and performs the
   cuSPARSE solve analysis on them, enabling MatSolveTranspose to run on
   the GPU.

   Called lazily by the transpose-solve routines on first use; the results
   are cached in cusparseTriFactors->loTriFactorPtrTranspose and
   ->upTriFactorPtrTranspose (callers only invoke this when both are NULL).

   Fix over previous version: loTriFactorT/upTriFactorT were initialized
   from A->spptr and then immediately overwritten with `new` — dead stores
   that misleadingly suggested the cached transposes were reused here.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* allocated below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;  /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill
     mode is flipped because the transpose of a lower triangular matrix is
     upper triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the matrix is stored explicitly transposed, so the
     solve itself is non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transposed lower factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor; the fill
     mode is flipped as above */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* cache the transposed upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
841 
/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds and caches an explicit
   transpose of the GPU matrix so that MatMultTranspose can be executed as a
   non-transposed multiply.

   The transpose is stored in cusparsestruct->matTranspose in the same
   storage format (CSR, ELL, or HYB) as the matrix itself.

   Fixes over previous version:
   - matstructT was initialized from cusparsestruct->matTranspose and then
     immediately overwritten with `new` (dead store);
   - rowoffsets_gpu (the temporary expanded row offsets in the CSR branch)
     was never freed, leaking device memory on every call.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* allocated below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the multiply information of the transpose */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars referenced by the SpMV calls
     (pointer mode DEVICE below) */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compress rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU) */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* convert CSR to CSC, i.e. build the transpose in CSR form */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* the expanded row offsets are only needed for the conversion; wait for
       it to complete, then release them (this buffer was previously leaked) */
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    delete rowoffsets_gpu;

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored with every
     row present, so this is just the identity sequence */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* cache the transpose */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
987 
/*
   MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU for
   factorizations with a non-identity row/column ordering.

   Since A = LU implies A^T = U^T L^T, the solve applies (U^T)^{-1} first
   and then (L^T)^{-1}, using explicitly transposed copies of the factors
   built lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve.

   NOTE(review): dereferences cusparseTriFactors->rpermIndices/cpermIndices,
   which the analysis routines only allocate for non-identity orderings;
   callers must select the NaturalOrdering variant otherwise.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Build and analyze the transposed factors on first use; they are cached
     in A->spptr for subsequent solves */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: tempGPU = (U^T)^{-1} x, using the transposed upper factor */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: x = (L^T)^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1056 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b on
   the GPU for factorizations whose ordering is the identity, so no
   permutation steps are needed. Since A^T = U^T L^T, the transposed upper
   factor is applied first, then the transposed lower factor.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build and analyze the transposed factors on first use; they are
     cached for subsequent solves */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the solution and right-hand side */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* first triangular solve: work = (U^T)^{-1} b */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* second triangular solve: x = (L^T)^{-1} work */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* hand the arrays back to the vectors */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1107 
/*
   MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU using the cached
   lower/upper triangular factors (A = LU: forward solve with L, then
   backward solve with U), for factorizations with a non-identity
   row/column ordering.

   NOTE(review): dereferences cusparseTriFactors->rpermIndices/cpermIndices,
   which the analysis routines only allocate for non-identity orderings;
   callers must select the NaturalOrdering variant otherwise.

   NOTE(review): in the permutation copies below the end iterator reuses the
   un-advanced base pointer (bGPU/xGPU); this appears to rely on thrust
   permutation_iterator comparison being driven by the index iterator —
   confirm against the Thrust documentation.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: tempGPU = L^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: x = U^{-1} tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1168 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solves A x = b on the GPU for
   factorizations whose ordering is the identity: a forward solve with the
   lower factor followed by a backward solve with the upper factor, with no
   permutation steps.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the solution and right-hand side */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* forward solve: work = L^{-1} b */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* backward solve: x = U^{-1} work */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* hand the arrays back to the vectors */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1211 
/*
   MatSeqAIJCUSPARSECopyToGPU - Mirrors the host CSR data (Mat_SeqAIJ) onto the GPU,
   (re)building the CUSPARSE data structures when needed.

   Fast path: when the matrix is assembled, its nonzero state matches what is already
   on the GPU, and the storage format is CSR, only the numerical values are copied.
   Otherwise the existing GPU multiply structure is destroyed and rebuilt from scratch.

   The GPU copy is always kept in compressed-row form: rows with no nonzeros are
   squeezed out, and matstruct->cprowIndices records the original row index of each
   kept row so results can be scattered back (see MatMultAdd_SeqAIJCUSPARSE).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->pinnedtocpu) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* copy values only */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* nonzero pattern changed (or first copy): tear down and rebuild the GPU structure.
         NOTE(review): the PetscErrorCode returned here is ignored — confirm intended */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count rows with at least one nonzero */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* host already stores a compressed-row index; reuse it */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          /* keep the row offset of each nonempty row and remember its original index */
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow; /* from here on, m is the compressed row count */
        }

        /* allocate space for the triangular factor information */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident scalar constants (1 and 0) used as alpha/beta; required
           because the handle is switched to CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* build a temporary CSR matrix on the GPU, convert it to HYB/ELL, then free the CSR copy.
             NOTE(review): if cusparse_csr2hyb fails, CHKERRCUDA returns and the temporary
             CsrMatrix leaks — confirm whether that error path matters here */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL forces the max-width partition; HYB lets cusparse choose */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging copy is no longer needed */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* ii/ridx were allocated above only in the non-compressed case */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector of compressed length, used by MatMultAdd when rows were squeezed out */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1349 
/* Binary functor for thrust::for_each over a zip iterator: accumulates the first
   tuple element into the second, i.e. t<1> += t<0>.  Used to scatter-add results
   computed on compressed rows back into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1359 
/* Computes yy = A*xx.  Thin wrapper: MatMultAdd with a NULL addend performs a plain
   multiply (see the yy == NULL handling in MatMultAdd_SeqAIJCUSPARSE). */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1368 
/*
   Computes yy = A^T * xx.

   An explicit transpose of the matrix is built lazily on the GPU (and cached in
   cusparsestruct->matTranspose) so the product can be performed with
   CUSPARSE_OPERATION_NON_TRANSPOSE, rather than using cusparse's transposed SpMV.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transposed multiply: build and cache the explicit transpose */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* y = 1*A^T*x + 0*y (alpha/beta live on the device; pointer mode DEVICE) */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB storage */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* one multiply and one add per nonzero, minus one add per nonempty row */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1413 
1414 
/*
   Computes zz = yy + A*xx (or zz = A*xx when yy is NULL; see MatMult_SeqAIJCUSPARSE).

   The GPU matrix is stored with empty rows squeezed out.  When rows were squeezed
   (cmpr is false) the SpMV writes into the shorter workVector and the result is then
   scatter-added into zz through matstruct->cprowIndices; otherwise the SpMV writes
   into zz directly and beta selects between overwrite (beta_zero) and accumulate
   (beta_one, only valid when yy == zz).
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* workVector has the compressed length; equal to n means no rows were squeezed out */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      /* NOTE(review): when yy && cmpr && yy == zz, beta_one below reads zarray obtained
         with GetArrayWrite — confirm the write-only access mode is still correct there */
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* destination of the SpMV: zz itself, or the compressed scratch vector */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* accumulate in place only when adding zz onto itself; otherwise overwrite */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* compressed result is in workVector; seed zz with yy before the scatter-add */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* SpMV overwrote zz (beta_zero); add yy explicitly */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* plain MatMult with compressed rows: squeezed-out rows of zz must be zero */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* zz[cprowIndices[i]] += workVector[i] for each compressed row i */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (!yy) { /* MatMult */
    if (!cusparsestruct->stream) {
      ierr = WaitForGPU();CHKERRCUDA(ierr);
    }
  }
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1502 
/*
   Computes zz = yy + A^T * xx, using the cached explicit transpose (built lazily,
   as in MatMultTranspose_SeqAIJCUSPARSE).  Mirrors the workVector/scatter logic of
   MatMultAdd_SeqAIJCUSPARSE, but the destination length is the column count of A.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*dptr,*beta;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* NOTE(review): zz is seeded from yy here, but every branch below either
       overwrites zz (beta_zero), copies yy again, or adds yy via AXPY — this first
       copy looks redundant; confirm before removing */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* write into zz directly when no rows were squeezed out; otherwise use the scratch vector */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      /* seed zz with yy before the scatter-add below */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      /* SpMV overwrote zz (beta_zero); add yy explicitly */
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation.
         NOTE(review): the iteration bound here is A->cmap->n, whereas the analogous
         scatter in MatMultAdd uses workVector->size(); confirm these agree when the
         transpose structure is compressed */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1580 
/* Finishes assembly: runs the CPU-side SeqAIJ assembly, pushes the data to the GPU
   (for non-factored matrices), and installs the CUSPARSE multiply kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* nothing further to do for an intermediate (flush) assembly */
  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* route all multiply variants through the CUSPARSE implementations */
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  A->ops->mult             = MatMult_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1597 
1598 /* --------------------------------------------------------------------------------*/
1599 /*@
1600    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1602    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1603    assembly performance the user should preallocate the matrix storage by setting
1604    the parameter nz (or the array nnz).  By setting these parameters accurately,
1605    performance during matrix assembly can be increased by more than a factor of 50.
1606 
1607    Collective
1608 
1609    Input Parameters:
1610 +  comm - MPI communicator, set to PETSC_COMM_SELF
1611 .  m - number of rows
1612 .  n - number of columns
1613 .  nz - number of nonzeros per row (same for all rows)
1614 -  nnz - array containing the number of nonzeros in the various rows
1615          (possibly different for each row) or NULL
1616 
1617    Output Parameter:
1618 .  A - the matrix
1619 
1620    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1622    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1623 
1624    Notes:
1625    If nnz is given then nz is ignored
1626 
1627    The AIJ format (also called the Yale sparse matrix format or
1628    compressed row storage), is fully compatible with standard Fortran 77
1629    storage.  That is, the stored row and column indices can begin at
1630    either one (as in Fortran) or zero.  See the users' manual for details.
1631 
1632    Specify the preallocated storage with either nz or nnz (not both).
1633    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1634    allocation.  For large problems you MUST preallocate memory or you
1635    will get TERRIBLE performance, see the users' manual chapter on matrices.
1636 
1637    By default, this format uses inodes (identical nodes) when possible, to
1638    improve numerical efficiency of matrix-vector products and solves. We
1639    search for consecutive rows with the same nonzero structure, thereby
1640    reusing matrix information to achieve increased efficiency.
1641 
1642    Level: intermediate
1643 
1644 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1645 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  Mat            B;

  PetscFunctionBegin;
  /* create the object, fix its type, then preallocate the CPU-side AIJ storage */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  B    = *A;
  ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1657 
/* Destroys the GPU-side data (multiply structures or triangular factors, depending on
   whether the matrix is factored), then delegates to the SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* only tear down GPU storage if it was ever allocated */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1673 
/*
   MatDuplicate_SeqAIJCUSPARSE - Duplicates a SEQAIJCUSPARSE matrix.

   The CPU-side data is duplicated by MatDuplicate_SeqAIJ(); here a fresh (empty) GPU
   data structure is attached to the copy, to be populated on the next copy-to-GPU
   (the new matrix is marked PETSC_OFFLOAD_UNALLOCATED).

   Fixes vs. previous revision: the triangular-factor branch zeroed ->handle and then
   immediately overwrote it after cusparseCreate() — the dead store is removed; the
   repeated C->spptr casts are replaced by typed locals.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
    C->spptr           = cusp;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    C->spptr                     = tri;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1732 
/* In-place conversion of a SeqAIJ matrix to SEQAIJCUSPARSE: attaches an (empty) GPU
   data structure, installs the CUSPARSE operations, and switches the default Vec type
   to VECCUDA.  Also used by the MATSEQAIJCUSPARSE constructor. */
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    B->spptr           = cusp;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  /* install the CUSPARSE implementations of the matrix operations */
  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1786 
/* Constructor for MATSEQAIJCUSPARSE: builds a plain SeqAIJ matrix, then converts it
   in place (see MatConvert_SeqAIJ_SeqAIJCUSPARSE). */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1796 
1797 /*MC
1798    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1799 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1801    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1802    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1803 
1804    Options Database Keys:
1805 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1806 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1807 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1808 
1809   Level: beginner
1810 
1811 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1812 M*/
1813 
1814 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1815 
1816 
/* Registers the CUSPARSE solver package for every factorization type it supports
   (LU, Cholesky, ILU, ICC), all backed by MatGetFactor_seqaijcusparse_cusparse. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  int                 i;

  PetscFunctionBegin;
  for (i=0; i<(int)(sizeof(ftypes)/sizeof(ftypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1828 
1829 
/*
   MatSeqAIJCUSPARSE_Destroy - Frees the GPU multiply structures (matrix, transpose,
   work vector) and the cusparse handle, then deletes the container and zeros the
   caller's pointer.  No-op if the pointer is already NULL.

   Fixes vs. previous revision: the PetscErrorCode returned by the two
   MatSeqAIJCUSPARSEMultStruct_Destroy() calls was silently discarded (now CHKERRQ'd),
   and the assignment-inside-if for the handle is split into two statements so the
   intent is unambiguous.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1848 
/* Frees the three thrust device arrays of a CsrMatrix and the struct itself, then
   zeros the caller's pointer.  Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = 0;
  PetscFunctionReturn(0);
}
1861 
/* Frees one triangular-factor structure: the cusparse matrix descriptor, the solve
   analysis info, and the CSR storage.  Zeros the caller's pointer; NULL-safe. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactor) PetscFunctionReturn(0);
  if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
  if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
  ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
  delete *trifactor;
  *trifactor = 0;
  PetscFunctionReturn(0);
}
1877 
/* Free one SpMV mult structure. The opaque ->mat pointer is interpreted
   according to 'format': a cusparseHybMat_t for ELL/HYB, a CsrMatrix*
   otherwise. Also releases the matrix descriptor, the compressed-row index
   vector, and the device-resident alpha/beta scalars. Nulls *matstruct when
   done; a NULL input is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* Check the error return; it was previously discarded. */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha/beta scalars live in device memory (used by cusparseCsrmv); free with cudaFree */
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1905 
/* Free the complete triangular-factor bundle used by the cuSPARSE solves:
   the lower/upper factors and their transposes, the row/column permutation
   index vectors, the scratch work vector, and the per-factor cuSPARSE
   handle. Nulls *trifactors when done; a NULL input is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* Propagate error codes from the per-factor destroy calls instead of discarding them. */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* Extra parens: the assignment-as-condition is intentional. */
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1928 
1929