xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 5bd1e5768a3b141382a3ebb2e780cd2d3b3bfbf0)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Option strings for MatCUSPARSEStorageFormat; the trailing type name, option prefix, and NULL are the sentinel entries required by PetscOptionsEnum() */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 
19 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
20 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
21 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
22 
23 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
24 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
25 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
26 
27 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
31 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
32 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
35 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
36 
37 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
38 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
39 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
40 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
41 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
42 
/*
   MatCUSPARSESetStream - Records a CUDA stream on the matrix and binds it to
   the matrix's cuSPARSE handle so subsequent cuSPARSE calls run on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  stream - the CUDA stream to use

   NOTE(review): assumes A->spptr is a valid Mat_SeqAIJCUSPARSE (no NULL check) -- confirm callers guarantee this.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* stash the stream so other routines can reference it, then attach it to the handle */
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
53 
/*
   MatCUSPARSESetHandle - Adopts a caller-supplied cuSPARSE handle for the
   matrix, destroying any previously attached (different) handle, and switches
   the handle to device pointer mode.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE
-  handle - the cuSPARSE handle the matrix should use from now on
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* the previous handle is owned by this matrix: release it before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  /* device pointer mode: scalar arguments (e.g. alpha/beta) are expected to live in device memory */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
69 
/*
   MatCUSPARSEClearHandle - Detaches (without destroying) the cuSPARSE handle
   from the matrix.  Used when the handle is owned elsewhere and must outlive
   this matrix.

   Input Parameter:
.  A - matrix of type SEQAIJCUSPARSE

   Note: unlike MatCUSPARSESetHandle()/MatCUSPARSESetStream(), this routine is
   safe to call when A->spptr has not been set up (or has already been torn
   down); the original code dereferenced the pointer unconditionally.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing attached: nothing to clear */
  if (cusparsestruct->handle)
    cusparsestruct->handle = 0;                /* drop the reference; ownership stays with the caller */
  PetscFunctionReturn(0);
}
78 
/* Reports the solver package name (MATSOLVERCUSPARSE) for factor matrices created by this package; composed on factors as "MatFactorGetSolverType_C". */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
85 
86 /*MC
87   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
88   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
92   algorithms are not recommended. This class does NOT support direct solver operations.
93 
94   Level: beginner
95 
96 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
97 M*/
98 
/*
   MatGetFactor_seqaijcusparse_cusparse - Factory routine for MATSOLVERCUSPARSE:
   creates an empty SEQAIJCUSPARSE factor matrix B of the same (square) size as A
   and installs the symbolic factorization routines appropriate for ftype.

   Input Parameters:
+  A     - the matrix to be factored (only its communicator and row size are used here)
-  ftype - MAT_FACTOR_LU/ILU/ILUDT or MAT_FACTOR_CHOLESKY/ICC

   Output Parameter:
.  B - the (unfactored) factor matrix shell
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  /* set the factor type before MatSetType so type construction sees a factor matrix */
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* NOTE(review): block sizes are propagated only for the LU family, not for Cholesky/ICC -- confirm this asymmetry is intended */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the symbolic factorization will allocate the structure */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
123 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation of
   MatCUSPARSESetFormat() for SEQAIJCUSPARSE: stores the requested storage
   format in the GPU-side structure.  MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
   are handled identically for sequential matrices; anything else is an error.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: seq matrices keep a single format */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
141 
142 /*@
143    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144    operation. Only the MatMult operation can use different GPU storage formats
145    for MPIAIJCUSPARSE matrices.
146    Not Collective
147 
148    Input Parameters:
149 +  A - Matrix of type SEQAIJCUSPARSE
150 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
152 
153    Output Parameter:
154 
155    Level: intermediate
156 
157 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (e.g. MatCUSPARSESetFormat_SeqAIJCUSPARSE); PetscTryMethod is a no-op for types that do not compose the method */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* options:
   -mat_cusparse_mult_storage_format (SpMV only, unfactored matrices) and
   -mat_cusparse_storage_format (SpMV and triangular solve).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* NOTE(review): for factored matrices A->spptr holds Mat_SeqAIJCUSPARSETriFactors, not Mat_SeqAIJCUSPARSE,
     so reading cusparsestruct->format below as the default may read the wrong struct -- confirm */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
194 
/* Symbolic ILU: delegate to the CPU SeqAIJ implementation, then install the CUSPARSE
   numeric routine.  The assignment must come after the call, which presumably installs
   the SeqAIJ numeric routine itself -- confirm. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
204 
/* Symbolic LU: delegate to the CPU SeqAIJ implementation, then override the
   numeric routine with the CUSPARSE version (order matters; see ILU variant). */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
214 
/* Symbolic ICC: delegate to the CPU SeqAIJ implementation, then override the
   numeric routine with the CUSPARSE version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
224 
/* Symbolic Cholesky: delegate to the CPU SeqAIJ implementation, then override
   the numeric routine with the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Extracts the unit-diagonal lower
   triangular factor L from the host (Mat_SeqAIJ) ILU factor storage of A,
   uploads it to the GPU in CSR form, and runs the cuSPARSE solve analysis.
   The result is stored in ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr.
   A no-op when n == 0 or when the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1; row 0 contributes only its diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host buffers: assembled on the CPU, then copied to the device via thrust assign() below */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based CSR, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copies of the host arrays built above */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the thrust assign()s above have synchronized, so the pinned staging buffers can be freed */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust typically throws std::exception-derived types; a char* handler may not catch them -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
338 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Extracts the upper triangular
   factor U (non-unit diagonal) from the host (Mat_SeqAIJ) ILU factor storage
   of A, uploads it to the GPU in CSR form, and runs the cuSPARSE solve
   analysis.  The result is stored in upTriFactorPtr.  A no-op when n == 0 or
   when the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases as i grows in this factor layout, so adiag[0]-adiag[n] spans all upper entries (incl. diagonals) */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        /* row i's off-diagonal upper entries start just past adiag[i+1] */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        /* v[nz] is the entry stored at adiag[i]; the SeqAIJ factor presumably keeps the inverted diagonal,
           so the reciprocal recovers d_ii -- confirm against MatLUFactorNumeric_SeqAIJ */
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based CSR, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copies of the host arrays built above */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust typically throws std::exception-derived types; a char* handler may not catch them -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
438 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds both GPU triangular
   factors of the ILU/LU factor A, allocates the solve work vector, and uploads
   the row/column permutations when they are not the identity.

   Fix over the original: the work vector and permutation arrays were allocated
   with `new` unconditionally on every call, leaking the previous allocations
   when the same factor matrix is numerically refactored (e.g. by KSP with a
   reused preconditioner).  They are now allocated only once; the permutations
   cannot change for a fixed symbolic factorization, so the cached copies stay
   valid.  CPU->GPU logging now happens only when an upload actually occurs.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* allocate once: repeated numeric factorizations reuse the same work vector */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: upload the row permutation unless it is the identity */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: upload the column permutation unless it is the identity */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
485 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds both GPU triangular factors
   for an ICC/Cholesky factor A from its host storage.  Only the upper factor
   U is assembled on the host (AiUp/AjUp with values AAUp); the lower solve
   reuses the same sparsity pattern with separate values (AALo) and is executed
   as a TRANSPOSE solve on an UPPER-fill matrix (see below).  A no-op when
   n == 0 or when the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed through both Mat_SeqAIJ (above) and Mat_SeqSBAIJ layouts here;
     only the i/j/a members are read through b -- confirm the layouts agree for these fields */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the last stored entry of row i, i.e. the diagonal slot */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* negate for the upper factor; the lower factor additionally scales by the diagonal slot */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based CSR, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copies of the host arrays built above */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: same UPPER-fill pattern as above; the lower solve
         is realized as a TRANSPOSE solve of this matrix (see solveOp below), with non-unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern (AiUp/AjUp), lower-factor values (AALo) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust typically throws std::exception-derived types; a char* handler may not catch them -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
632 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds both GPU triangular
   factors of the ICC/Cholesky factor A, allocates the solve work vector, and
   uploads the permutation (and its inverse) when it is not the identity.

   Fix over the original (mirrors the ILU analysis routine): the work vector
   and permutation arrays were `new`-allocated unconditionally on every call,
   leaking the previous allocations when the same factor matrix is numerically
   refactored.  They are now allocated only once; the permutation cannot change
   for a fixed symbolic factorization.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* allocate once: repeated numeric factorizations reuse the same work vector */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n; /* strictly triangular entries counted for both factors, diagonal once */

  /* lower triangular indices: upload the permutation and its inverse unless it is the identity */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
668 
/* Numeric LU factorization: factor on the CPU with the SeqAIJ kernel, then pick
   the appropriate GPU MatSolve variants and copy the factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* The natural-ordering solvers skip the permutation steps, so use them only
     when both the row and column orderings are the identity. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no multi-RHS solve support on the GPU */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
697 
/* Numeric Cholesky factorization: factor on the CPU with the SeqAIJ kernel, then
   pick the appropriate GPU MatSolve variants and copy the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* The natural-ordering solvers skip the permutation steps, so use them only
     when the ordering is the identity. */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no multi-RHS solve support on the GPU */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
726 
/*
  Builds explicit transposes (via csr2csc, i.e. the CSR of the transpose) of the
  lower and upper triangular factors already resident on the GPU, and runs the
  cuSPARSE solve analysis on each.  Results are stored in
  loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr; they are consumed
  by MatSolveTranspose_SeqAIJCUSPARSE{_NaturalOrdering}, which can then perform
  the transpose solves with CUSPARSE_OPERATION_NON_TRANSPOSE.

  Assumes loTriFactorPtr/upTriFactorPtr are valid (set by the ILU/ICC
  analysis-and-copy routines); callers invoke this lazily on first use.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* allocated below; the former initialization from loTriFactorPtrTranspose was dead code */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the lower triangular factor; transposition flips the fill mode */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the transpose is stored explicitly, so the solve uses the non-transpose op */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* hang the result off the factor container */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptors of the upper triangular factor; transposition flips the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the transpose is stored explicitly, so the solve uses the non-transpose op */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* hang the result off the factor container */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
857 
/*
  Builds an explicit transpose of the matrix on the GPU (in the same storage
  format, CSR or ELL/HYB, as the matrix itself) and stores it in
  A->spptr->matTranspose, so MatMultTranspose can run a non-transposed SpMV.
  Also allocates device-resident alpha/beta scalars for the cuSPARSE calls.

  Fixes over the previous version: the temporary expanded row-offset array is
  now freed (it leaked device memory), and the dead initialization of
  matstructT from matTranspose was removed.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* allocated below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;

  /* allocate space for the transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars used with CUSPARSE_POINTER_MODE_DEVICE */
  err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    thrust::device_vector<int> *rowoffsets_gpu;
    const int ncomprow = matstruct->cprowIndices->size();
    thrust::host_vector<int> cprow(ncomprow);
    cprow = *matstruct->cprowIndices; // GPU --> CPU
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);
    PetscInt k,i;
    thrust::host_vector<int> rowst_host(A->rmap->n+1);

    /* expand compressed rows, which is forced in constructor (MatSeqAIJCUSPARSECopyToGPU):
       rows not listed in cprowIndices are empty and get a zero-length span */
    rowst_host[0] = 0;
    for (k = 0, i = 0; i < A->rmap->n ; i++) {
      if (k < ncomprow && i==cprow[k]) {
        rowst_host[i+1] = a->i[i+1];
        k++;
      } else rowst_host[i+1] = rowst_host[i];
    }
    if (k!=ncomprow) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"%D != %D",k,ncomprow);
    rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    *rowoffsets_gpu = rowst_host; // CPU --> GPU

    /* compute the transpose, i.e. the CSC, from the expanded CSR */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* free the temporary expanded row offsets; previously this leaked device memory.
       NOTE(review): relies on cudaFree (in the thrust destructor) synchronizing with
       the preceding csr2csc before releasing the buffer -- confirm on target CUDA version */
    delete rowoffsets_gpu;

    /* assign the pointer */
    matstructT->mat = matrixT;
    ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so every row index 0..A->cmap->n-1 is listed */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1003 
/* Solves A^T x = b with permutations, using explicitly stored transposes of the
   triangular factors (built lazily below).  Since A = L*U (up to the row/column
   permutations), A^T = U^T * L^T, so the transposed upper factor is solved first.
   NOTE(review): why do we need to analyze the transposed matrix again? Can't we just
   use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Lazily analyze the matrix and create the transposed factors on first use */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather b through the row permutation into x */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (U^T), x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the transposed lower factor (L^T), tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the Vec arrays; WaitForGPU ensures timing covers the async work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1073 
/* Solves A^T x = b for the natural-ordering case (no permutations needed),
   using explicitly stored transposes of the triangular factors: solve U^T,
   then L^T.  The transposed factors are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Lazily analyze the matrix and create the transposed factors on first use */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor (U^T), b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve with the transposed lower factor (L^T), tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the Vec arrays; WaitForGPU ensures timing covers the async work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1124 
/* Solves A x = b with permutations: gather b through the row permutation,
   forward-solve L, back-solve U, then scatter through the column permutation.
   Requires rpermIndices/cpermIndices set by the ILU/ICC analysis routines. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather b through the row permutation into the work vector.
     NOTE(review): the iteration range appears to be governed by the index
     iterators (rpermIndices begin/end), not the element base -- confirm
     against thrust's permutation_iterator semantics */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L, tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Then, solve U, x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Last, scatter through the column permutation into the output x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  /* restore the Vec arrays; WaitForGPU ensures timing covers the async work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1182 
/* Solves A x = b for the natural-ordering case: no permutation steps, just
   forward-solve L into the work vector, then back-solve U into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L, b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Next, solve U, tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the Vec arrays; WaitForGPU ensures timing covers the async work */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1225 
/*
  Pushes the host (CPU) AIJ data of A down to the GPU, (re)building the
  cuSPARSE multiply structure if needed.

  Fast path: if the matrix is assembled, its nonzero pattern is unchanged
  (nonzerostate matches) and the storage format is CSR, only the numerical
  values are copied to the existing device arrays.

  Slow path: the old multiply structure is destroyed and rebuilt from
  scratch.  Rows are always stored compressed on the GPU: if the host does
  not already use compressed-row storage, a temporary (ii, ridx) compressed
  index pair is constructed here and freed at the end.

  On success A->valid_GPU_matrix is PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  /* nothing to do unless the freshest copy of the data lives on the CPU */
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
      /* same nonzero pattern as before: copy values only, reuse device index arrays */
      matrix->values->assign(a->a, a->a+a->nz);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* pattern (or format) changed: tear down and rebuild the whole multiply structure */
      MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
      try {
        /* count rows with at least one nonzero; this is the compressed row count */
        cusparsestruct->nonzerorow=0;
        for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

        if (a->compressedrow.use) {
          /* host already stores compressed rows: reuse its index arrays directly */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          /* Forcing compressed row on the GPU: build temporary row offsets (ii)
             and the map from compressed row to original row (ridx) */
          int k=0;
          ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
          ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
          ii[0]=0;
          for (int j = 0; j<m; j++) {
            if ((a->i[j+1]-a->i[j])>0) {
              ii[k]  = a->i[j];
              ridx[k]= j;
              k++;
            }
          }
          ii[cusparsestruct->nonzerorow] = a->nz;
          m = cusparsestruct->nonzerorow;  /* from here on m is the compressed row count */
        }

        /* allocate and initialize the new multiply structure (descriptor + scalar constants) */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

        /* device-resident alpha/beta scalars so spmv can run with device pointer mode */
        err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix: upload compressed row offsets, column indices and values */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = matrix;

        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* ELL/HYB: build a temporary device CSR, convert with csr2hyb, then free the CSR */
          CsrMatrix *matrix= new CsrMatrix;
          matrix->num_rows = m;
          matrix->num_cols = A->cmap->n;
          matrix->num_entries = a->nz;
          matrix->row_offsets = new THRUSTINTARRAY32(m+1);
          matrix->row_offsets->assign(ii, ii + m+1);

          matrix->column_indices = new THRUSTINTARRAY32(a->nz);
          matrix->column_indices->assign(a->j, a->j+a->nz);

          matrix->values = new THRUSTARRAY(a->nz);
          matrix->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
          /* ELL uses a MAX partition (pure ellpack); HYB lets cuSPARSE choose the split */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
              matstruct->descr, matrix->values->data().get(),
              matrix->row_offsets->data().get(),
              matrix->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUDA(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once the HYB copy exists */
          if (matrix) {
            if (matrix->values) delete (THRUSTARRAY*)matrix->values;
            if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
            if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
            delete (CsrMatrix*)matrix;
          }
        }

        /* assign the compressed row indices (compressed row -> original row map) */
        matstruct->cprowIndices = new THRUSTINTARRAY(m);
        matstruct->cprowIndices->assign(ridx,ridx+m);
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;

        /* free the temporary compressed-row arrays built above (host ones are owned by a) */
        if (!a->compressedrow.use) {
          ierr = PetscFree(ii);CHKERRQ(ierr);
          ierr = PetscFree(ridx);CHKERRQ(ierr);
        }
        /* scratch vector of compressed-row length used by MatMultAdd et al. */
        cusparsestruct->workVector = new THRUSTARRAY(m);
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1362 
/* Thrust functor: add the first tuple element into the second (dst += src).
   Used with zip/permutation iterators to scatter-add compressed-row SpMV
   results back into the full-length result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple tup)
  {
    thrust::get<1>(tup) += thrust::get<0>(tup);
  }
};
1372 
/* yy = A*xx: delegated to MatMultAdd with a NULL addend vector */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* yy == NULL third argument tells MatMultAdd to skip the addition */
  ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1381 
/*
  yy = A^T * xx.

  Uses an explicitly stored transpose of A on the GPU (matTranspose), built
  lazily on first use by MatSeqAIJCUSPARSEGenerateTransposeForMult() and then
  reused; the SpMV itself runs with CUSPARSE_OPERATION_NON_TRANSPOSE on that
  explicit transpose.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* yy = 1.0 * A^T * xx + 0.0 * yy (beta_zero => yy is pure output) */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  } else {
    /* ELL/HYB storage path */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta_zero,
                             yarray);CHKERRCUDA(stat);
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2 flops per nonzero, minus one per nonempty row (no add for the first entry) */
  ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1426 
1427 
/*
  zz = A*xx + yy (or zz = A*xx when yy is NULL; MatMult delegates here).

  The GPU matrix stores only nonempty rows ("compressed" storage), so when A
  has empty rows the SpMV result has fewer entries than zz.  In that case the
  product is computed into workVector and scatter-added into zz through
  cprowIndices (compressed row -> original row map).  When no rows were
  compressed (cmpr), the SpMV writes straight into zz, using beta_one to fold
  in yy when yy aliases zz.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  const PetscScalar            *xarray;
  PetscScalar                  *zarray,*dptr,*beta;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    cmpr; /* if the matrix has been compressed (zero rows) */

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* cmpr == true means workVector is full-length, i.e. no rows were actually dropped */
    cmpr = (PetscBool)(cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n));
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) { /* MatMultAdd with noncompressed storage -> need uptodate zz vector */
      ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      /* NOTE(review): when yy != NULL, cmpr true and yy aliases zz, beta_one below
         reads zz's device data even though it was fetched with GetArrayWrite —
         confirm GetArrayWrite keeps device data valid in that case */
      ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
    /* dptr: where the SpMV result goes (zz directly, or the compressed scratch vector) */
    dptr = cmpr ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one folds yy into the product only when zz already holds yy (aliased) */
    beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* csr_spmv is multiply add */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      /* ELL/HYB path; skip the SpMV entirely for an empty matrix */
      if (cusparsestruct->workVector->size()) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (yy) {
      if (dptr != zarray) {
        /* product is in workVector: initialize zz with yy, then scatter-add below */
        ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
      } else if (zz != yy) {
        /* product was written directly into zz with beta_zero: add yy explicitly */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
      }
    } else if (dptr != zarray) {
      /* no addend: rows that were compressed away must end up zero in zz */
      ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);
      /* zz[cprowIndices[i]] += workVector[i] for each compressed row i */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    if (yy && !cmpr) {
      ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1511 
/*
  zz = A^T * xx + yy.

  Like MatMultTranspose, uses the lazily built explicit transpose
  (matTranspose) with a NON_TRANSPOSE SpMV.  Like MatMultAdd, routes the
  product through workVector and a scatter-add when the transpose's row
  count does not match the full output length (A->cmap->n).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray,*dptr,*beta;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* NOTE(review): this initial copy is repeated after the SpMV when
       dptr != zarray; the early copy appears redundant in that path — verify */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
    /* write directly into zz only when workVector is full output length (A->cmap->n) */
    dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->cmap->n) ? zarray : cusparsestruct->workVector->data().get();
    /* beta_one folds yy into the product only when zz already holds yy (aliased) */
    beta = (yy == zz && dptr == zarray) ? matstructT->beta_one : matstructT->beta_zero;

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, beta,
                               dptr);CHKERRCUDA(stat);
    } else {
      /* ELL/HYB path; skip the SpMV entirely for an empty matrix */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray, beta,
                                 dptr);CHKERRCUDA(stat);
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (dptr != zarray) {
      /* product is in workVector: reset zz to yy, then scatter-add below */
      ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    } else if (zz != yy) {
      /* product was written directly into zz with beta_zero: add yy explicitly */
      ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
    }
    /* scatter the data from the temporary into the full vector with a += operation */
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (dptr != zarray) {
      thrust::device_ptr<PetscScalar> zptr;

      zptr = thrust::device_pointer_cast(zarray);

      /* scatter the data from the temporary into the full vector with a += operation.
         NOTE(review): the iteration length here is A->cmap->n while the analogous
         code in MatMultAdd uses workVector->size(); confirm matstructT->cprowIndices
         and workVector really have A->cmap->n entries in this path */
      thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
                       thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
                       VecCUDAPlusEquals());
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr); /* is this needed? just for yy==0 in Mult */
  ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1589 
/*
  Completes assembly on the host, then, for a final (non-flush) assembly of a
  non-factored matrix, pushes the data to the GPU and installs the CUSPARSE
  multiply kernels.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* factored matrices are populated by the factorization routines, not here */
    if (A->factortype == MAT_FACTOR_NONE) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    }
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1606 
1607 /* --------------------------------------------------------------------------------*/
1608 /*@
1609    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1611    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1612    assembly performance the user should preallocate the matrix storage by setting
1613    the parameter nz (or the array nnz).  By setting these parameters accurately,
1614    performance during matrix assembly can be increased by more than a factor of 50.
1615 
1616    Collective
1617 
1618    Input Parameters:
1619 +  comm - MPI communicator, set to PETSC_COMM_SELF
1620 .  m - number of rows
1621 .  n - number of columns
1622 .  nz - number of nonzeros per row (same for all rows)
1623 -  nnz - array containing the number of nonzeros in the various rows
1624          (possibly different for each row) or NULL
1625 
1626    Output Parameter:
1627 .  A - the matrix
1628 
1629    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1631    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1632 
1633    Notes:
1634    If nnz is given then nz is ignored
1635 
1636    The AIJ format (also called the Yale sparse matrix format or
1637    compressed row storage), is fully compatible with standard Fortran 77
1638    storage.  That is, the stored row and column indices can begin at
1639    either one (as in Fortran) or zero.  See the users' manual for details.
1640 
1641    Specify the preallocated storage with either nz or nnz (not both).
1642    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1643    allocation.  For large problems you MUST preallocate memory or you
1644    will get TERRIBLE performance, see the users' manual chapter on matrices.
1645 
1646    By default, this format uses inodes (identical nodes) when possible, to
1647    improve numerical efficiency of matrix-vector products and solves. We
1648    search for consecutive rows with the same nonzero structure, thereby
1649    reusing matrix information to achieve increased efficiency.
1650 
1651    Level: intermediate
1652 
1653 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1654 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  Mat            B;

  PetscFunctionBegin;
  /* create the matrix, fix its type to seqaijcusparse, then preallocate */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  B    = *A;
  ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* nnz (if non-NULL) takes precedence over the uniform nz estimate */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1666 
/* Releases the GPU-side data (multiply structures or triangular factors),
   then destroys the underlying host SeqAIJ matrix. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices carry triangular-factor data in spptr */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
    /* only free device data if something was ever allocated on the GPU */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1682 
/*
  Duplicates the host SeqAIJ matrix and attaches fresh, empty CUSPARSE GPU
  metadata to the copy.  Device-side data is rebuilt lazily at the next
  MatSeqAIJCUSPARSECopyToGPU(), so valid_GPU_matrix is left UNALLOCATED.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;

  /* the duplicate defaults to CUDA vectors */
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;

    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
    C->spptr            = spptr;
  } else {
    /* factored matrix: empty triangular-factor bookkeeping */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    C->spptr                     = tri;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1741 
/*
  Converts an existing SeqAIJ matrix into a SeqAIJCUSPARSE matrix in place:
  switches the default vector type to CUDA, attaches empty CUSPARSE metadata
  in spptr, installs the CUSPARSE operation table, and renames the type.
*/
PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;

    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle       = handle;
    spptr->nonzerostate = 0;
    B->spptr            = spptr;
  } else {
    /* factored matrix: empty triangular-factor bookkeeping */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    B->spptr                     = tri;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1795 
/* Type constructor: build a plain SeqAIJ matrix, then convert it in place
   into a SeqAIJCUSPARSE matrix. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1805 
1806 /*MC
1807    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1808 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1810    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1811    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1812 
1813    Options Database Keys:
1814 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1815 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1816 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1817 
1818   Level: beginner
1819 
1820 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1821 M*/
1822 
1823 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1824 
1825 
/* Registers the cusparse solver package for all four factorization kinds
   (LU, Cholesky, ILU, ICC); each is served by the same factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  PetscInt            i;
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (i=0; i<(PetscInt)(sizeof(factors)/sizeof(factors[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1837 
1838 
/* Frees the CUSPARSE container: both multiply structures, the scratch
   vector and the cusparse handle; zeroes the caller's pointer. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (!*cusparsestruct) PetscFunctionReturn(0);
  MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
  MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
  delete (*cusparsestruct)->workVector;   /* delete of NULL is a no-op */
  handle = (*cusparsestruct)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUDA(stat);
  }
  delete *cusparsestruct;
  *cusparsestruct = 0;
  PetscFunctionReturn(0);
}
1857 
/* Frees the three thrust arrays of a CsrMatrix and the struct itself;
   zeroes the caller's pointer. NULL-safe. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    *mat = 0;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
  }
  PetscFunctionReturn(0);
}
1870 
/*
   Destroys one triangular-factor structure: its cusparse matrix
   descriptor, its solve-analysis info, and its CSR storage; then
   deletes the structure and zeros the caller's pointer.
   A NULL *trifactor is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1886 
/*
   Destroys a mult structure for the given storage format: the GPU
   matrix (HYB handle for ELL/HYB, CsrMatrix otherwise), the cusparse
   matrix descriptor, the compressed-row index vector, and the three
   device-resident scalar constants (alpha, beta_zero, beta_one);
   then deletes the structure and zeros the caller's pointer.
   A NULL *matstruct is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL is stored via the cusparse HYB container as well */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code (previously discarded) to stay consistent
           with the file's ierr/CHKERRQ error-handling convention */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1914 
/*
   Destroys the complete triangular-factor bundle used by the
   cusparse solvers: the lower/upper factors and their transposes,
   the row/column permutation index vectors, the solve work vector,
   and the cusparse handle; then deletes the bundle and zeros the
   caller's pointer. A NULL *trifactors is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the PetscErrorCode returns (previously discarded) so that a
       failed destroy propagates through the usual PETSc error machinery */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;   /* delete of NULL is safe in C++ */
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1937 
1938