xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 1f0d623c8336219eb98f7ded6f95c151ca603fe7)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 
6 #include <petscconf.h>
7 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include <../src/vec/vec/impls/dvecimpl.h>
10 #include <petsc-private/vecimpl.h>
11 #undef VecType
12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
13 
14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19 
20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23 
24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's
   Mat_SeqAIJCUSPARSE structure and attaches it to the cuSPARSE handle so
   subsequent cuSPARSE calls on this matrix run on that stream.
   NOTE(review): assumes A->spptr is a valid Mat_SeqAIJCUSPARSE with a live
   handle — no guard here; confirm callers guarantee this.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *spgpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  spgpu->stream = stream;
  stat = cusparseSetStream(spgpu->handle,stream);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
46 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - replaces the matrix's cuSPARSE handle with a
   caller-supplied one, destroying any handle previously owned by the
   matrix, and switches the handle to device pointer mode.

   Bug fix: the original guarded only the cusparseDestroy() assignment with
   the if; the trailing CHKERRCUSP(stat) ran unconditionally and read an
   uninitialized `stat` whenever no prior handle existed. Braces added so the
   error check only runs when the destroy was actually performed.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  /* device pointer mode: scalar arguments to cuSPARSE calls live in GPU memory */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
61 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - drops the matrix's reference to its cuSPARSE
   handle WITHOUT destroying it (the handle is presumably owned elsewhere,
   e.g. installed via MatCUSPARSESetHandle — confirm with callers).
   Setting it to 0 when it is already 0 is a no-op, so the assignment is
   done unconditionally.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
72 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query routine composed on factor matrices created by
   MatGetFactor_seqaij_cusparse(): reports MATSOLVERCUSPARSE as the solver
   package that produced the factor.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
81 
82 /*MC
83   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
84   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
85   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
86   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
87   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
88   algorithms are not recommended. This class does NOT support direct solver operations.
89 
90   Level: beginner
91 
92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
93 M*/
94 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine registered for MATSOLVERCUSPARSE: creates the (empty)
   factor matrix B of type MATSEQAIJCUSPARSE and installs the symbolic
   factorization function pointers appropriate to the requested factor type.
   LU/ILU/ILUDT and Cholesky/ICC are supported; anything else errors out.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  /* factors are square, sized by A's row map */
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the symbolic factorization will allocate the structure later */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format in the Mat_SeqAIJCUSPARSE structure. For a
   sequential matrix MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same
   single format field, so the two cases share one arm. Before CUDA 4.2 only
   CSR is available, so ELL/HYB requests are rejected there.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops map to the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
146 
147 /*@
148    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
149    operation. Only the MatMult operation can use different GPU storage formats
150    for MPIAIJCUSPARSE matrices.
151    Not Collective
152 
153    Input Parameters:
154 +  A - Matrix of type SEQAIJCUSPARSE
155 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
156 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
157 
158    Output Parameter:
159 
160    Level: intermediate
161 
162 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
163 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: dispatches to the composed type-specific implementation
     (MatCUSPARSESetFormat_SeqAIJCUSPARSE for this type) and is a silent no-op
     for matrix types that never composed "MatCUSPARSESetFormat_C". */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
175 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix:
   -mat_cusparse_mult_storage_format (SpMV only, non-factored matrices) and
   -mat_cusparse_storage_format (SpMV and triangular solve).

   Fixes vs. original: PetscOptionsHead() was invoked BEFORE
   PetscObjectOptionsBegin() even though Head/Tail must be nested inside a
   Begin/End pair, and the matching PetscOptionsTail() was missing entirely.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* SpMV-only format; meaningless on factored matrices, hence the guard */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
204 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU: delegates entirely to the CPU SeqAIJ implementation, then
   redirects the numeric stage to the CUSPARSE version so the GPU triangular
   factors get built after the numeric factorization.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* NOTE(review): assignment intentionally placed AFTER the call above —
     presumably the SeqAIJ routine installs its own numeric pointer which we
     must override; keep this order. */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
216 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU: delegates to the CPU SeqAIJ implementation, then redirects
   the numeric stage to the CUSPARSE version (which also builds the GPU
   triangular factors). Assignment must follow the delegated call.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
228 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ICC: delegates to the CPU SeqAIJ implementation, then redirects
   the numeric stage to the CUSPARSE Cholesky version. Assignment must
   follow the delegated call.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
240 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic Cholesky: delegates to the CPU SeqAIJ implementation, then
   redirects the numeric stage to the CUSPARSE version. Assignment must
   follow the delegated call.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
252 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Builds the unit-diagonal lower triangular factor L on the GPU from the
   host-side ILU/LU factorization stored in Mat_SeqAIJ (CSR) form, then runs
   the cuSPARSE triangular-solve analysis on it. Only rebuilds when the GPU
   copy is unallocated or stale relative to the CPU data.
   NOTE(review): the Mat_SeqAIJ factor layout is assumed to store, per row,
   the strictly-lower entries followed by U's entries, with L's unit diagonal
   implicit — the copy loop below depends on that; confirm against
   MatLUFactorNumeric_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;           /* host-staged CSR row offsets / column indices of L */
  PetscScalar                       *AALo;                  /* host-staged values of L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers: staged here, copied to device by thrust below */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 of L is just the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy row i's strictly-lower entries, then append the explicit unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* thrust assign() performs the host-to-device copies of the staged CSR arrays */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the staged host copies are no longer needed once on the device */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
356 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Builds the upper triangular factor U on the GPU from the host-side ILU/LU
   factorization stored in Mat_SeqAIJ form, then runs the cuSPARSE
   triangular-solve analysis on it. Only rebuilds when the GPU copy is
   unallocated or stale relative to the CPU data.
   NOTE(review): relies on a->diag storing, per row, the factor's diagonal
   position with U's off-diagonal entries following it, and on the stored
   diagonal being the reciprocal's source (1./v[nz] below) — confirm against
   MatLUFactorNumeric_SeqAIJ's layout.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;           /* host-staged CSR row offsets / column indices of U */
  PetscScalar                       *AAUp;                  /* host-staged values of U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers; copied to the device via thrust assign() below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      /* rows are filled back-to-front, decrementing offset from nzUpper to 0 */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the diagonal value is stored inverted: U's solve uses a NON_UNIT diagonal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
456 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   Pushes a completed CPU ILU/LU factorization to the GPU: builds both
   triangular factors, allocates the device work vector used by the solves,
   and, for non-identity row/column orderings, uploads the permutation
   indices so MatSolve can permute on the device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector shared by the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  /* must be set AFTER both Build* calls above, which only rebuild while the flag says the GPU is stale */
  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* upload the row permutation only when it is not the identity */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
497 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both GPU triangular factors for an ICC/Cholesky factorization.
   Only the upper factor U is explicitly assembled host-side; the "lower"
   solve is realized as a TRANSPOSED solve with upper-stored data (see the
   loTriFactor setup below), so both factors share the AiUp/AjUp structure
   and differ only in their values (AAUp vs AALo).
   NOTE(review): A->data is cast both as Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ
   (for i/j/a) — this assumes the leading members of the two structs line up
   for the fields used here; confirm against the struct definitions.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;        /* shared CSR structure of both factors */
  PetscScalar                       *AAUp;               /* values for the upper (unit-diag) solve */
  PetscScalar                       *AALo;               /* values for the transposed "lower" solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers, copied to the device via thrust assign() below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the row's diagonal entry; both factors store its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals: negated for AAUp; AALo additionally scaled by the inverse diagonal */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      /* intentionally UPPER: the lower solve is done as a TRANSPOSE operation
         on the upper-stored structure (see solveOp below) */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same structure as the upper factor; only the values differ */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
644 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   Pushes a completed CPU ICC/Cholesky factorization to the GPU: builds the
   two triangular factors, allocates the device work vector for the solves,
   and uploads the (single, symmetric) permutation to both the row and
   column index arrays when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* a->nz counts the stored upper triangle (incl. diagonal); total factor
     nonzeros are the off-diagonals counted twice plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric ordering: the same permutation serves as both row and column permutation */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
675 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU factorization.  The factorization itself runs on the CPU via
   the plain SeqAIJ routine; afterwards the GPU solve kernels are installed
   (permuted vs. natural ordering) and the triangular factors are copied to
   the device with their cusparse solve analysis.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata = (Mat_SeqAIJ*)B->data;
  IS             rowIS = bdata->row,colIS = bdata->col;
  PetscBool      rowIsNatural,colIsNatural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the solve variants: natural ordering skips the permutation copies */
  ierr = ISIdentity(rowIS,&rowIsNatural);CHKERRQ(ierr);
  ierr = ISIdentity(colIS,&colIsNatural);CHKERRQ(ierr);
  if (!rowIsNatural || !colIsNatural) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
702 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric Cholesky factorization.  The factorization itself runs on the CPU
   via the plain SeqAIJ routine; afterwards the GPU solve kernels are
   installed (permuted vs. natural ordering) and the triangular factor data
   is copied to the device with its cusparse solve analysis.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata = (Mat_SeqAIJ*)B->data;
  IS             permIS = bdata->row;
  PetscBool      isNatural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* pick the solve variants: natural ordering skips the permutation copies */
  ierr = ISIdentity(permIS,&isNatural);CHKERRQ(ierr);
  if (!isNatural) {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
729 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Builds explicit transposes (CSC form) of the cached lower and upper
   triangular factors so the transpose solves can run as NON_TRANSPOSE
   cusparse triangular solves, and performs the cusparse solve analysis on
   each.  Results are cached in
   cusparseTriFactors->{lo,up}TriFactorPtrTranspose; callers invoke this
   lazily, only when those pointers are still NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* assigned below; the old dead initialization from the cache was removed */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the lower factor, flipping the fill
     mode: the transpose of a lower triangular matrix is upper triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the transpose is stored explicitly, so the solve op is NON_TRANSPOSE */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor
     (the factor is square, so the row/column counts carry over) */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* cache the transposed lower factor (cusparseTriFactors aliases A->spptr) */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the upper factor, flipping the fill mode */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the transpose is stored explicitly, so the solve op is NON_TRANSPOSE */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* cache the transposed upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
862 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds an explicit transpose of the matrix on the GPU so MatMultTranspose
   can run a NON_TRANSPOSE SpMV.  For CSR storage the transpose is obtained
   directly with csr2csc; for ELL/HYB storage the matrix is converted
   HYB->CSR, transposed, and converted back to HYB (requires CUDA 5.0+).
   The result is cached in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* assigned below; the old dead initialization from the cache was removed */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information, reusing the index base of
     the forward matrix descriptor */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta: device-resident scalars for the SpMV, matching the
     DEVICE pointer mode set below */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose of an m x n matrix is n x m; sizing row_offsets with the
       transpose's row count (A->cmap->n+1) matters because csr2csc writes
       num_cols+1 offsets -- the previous A->rmap->n-based sizing overflowed
       for rectangular matrices with more columns than rows */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. its CSC representation
       (indexBase was already read from matstruct->descr above) */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose); as in the CSR
       branch, the transpose has A->cmap->n rows, so its row_offsets array
       must hold A->cmap->n+1 entries */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB; dimensions are those of the transpose */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the (empty) compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* cache the transpose (cusparsestruct aliases A->spptr) */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
991 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b on the GPU for a factorization with a non-identity
   ordering.  Uses explicitly transposed copies of the triangular factors
   (built lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve) so each sweep
   runs as a NON_TRANSPOSE cusparse triangular solve: U^T first, then L^T,
   with permutation copies before and after.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation.
     NOTE(review): this mirrors the forward solve (rperm applied to b here,
     cperm applied to x at the end).  For a transpose solve one might expect
     the roles of the row and column permutations to be swapped -- confirm
     this ordering is intended for non-symmetric permutations. */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* First, solve U (i.e. U^T, stored explicitly transposed): xGPU -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L (i.e. L^T): tempGPU -> xGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1055 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A^T x = b on the GPU for a natural-ordering factorization (no
   permutation copies needed).  Uses explicitly transposed copies of the
   triangular factors, built lazily by
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve, so each sweep runs as a
   NON_TRANSPOSE cusparse triangular solve: U^T first, then L^T.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *workGPU = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors on first use */
  if (!lowerT && !upperT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  }

  /* device pointers for the solution and right-hand side */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* first sweep: U^T w = b */
  stat = cusparse_solve(triFactors->handle, upperT->solveOp,
                        upperT->csrMat->num_rows, &ALPHA, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        barray->data().get(), workGPU->data().get());CHKERRCUSP(stat);

  /* second sweep: L^T x = w */
  stat = cusparse_solve(triFactors->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows, &ALPHA, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        workGPU->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* release the device arrays and wait for completion */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1105 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solves A x = b on the GPU for a factorization with a non-identity
   ordering: permute b with the row permutation, forward-solve L, then
   back-solve U, and finally apply the column permutation to the result.
   Both vectors must be of type VECSEQCUSP.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                       *workGPU = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;
  VecType                           vectype;
  PetscBool                         iscusp;

  PetscFunctionBegin;
  /* both vectors must live on the GPU */
  ierr = VecGetType(bb,&vectype);CHKERRQ(ierr);
  ierr = PetscStrcmp(vectype,VECSEQCUSP,&iscusp);CHKERRQ(ierr);
  if (!iscusp) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",vectype,VECSEQCUSP);
  ierr = VecGetType(xx,&vectype);CHKERRQ(ierr);
  ierr = PetscStrcmp(vectype,VECSEQCUSP,&iscusp);CHKERRQ(ierr);
  if (!iscusp) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",vectype,VECSEQCUSP);

  /* device pointers for the solution and right-hand side */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* apply the row permutation: x <- P_r b */
  thrust::copy(thrust::make_permutation_iterator(barray->begin(), triFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(barray->end(), triFactors->rpermIndices->end()),
               xarray->begin());

  /* forward sweep: L w = P_r b */
  stat = cusparse_solve(triFactors->handle, lowerFactor->solveOp,
                        lowerFactor->csrMat->num_rows, &ALPHA, lowerFactor->descr,
                        lowerFactor->csrMat->values->data().get(),
                        lowerFactor->csrMat->row_offsets->data().get(),
                        lowerFactor->csrMat->column_indices->data().get(),
                        lowerFactor->solveInfo,
                        xarray->data().get(), workGPU->data().get());CHKERRCUSP(stat);

  /* backward sweep: U x = w */
  stat = cusparse_solve(triFactors->handle, upperFactor->solveOp,
                        upperFactor->csrMat->num_rows, &ALPHA, upperFactor->descr,
                        upperFactor->csrMat->values->data().get(),
                        upperFactor->csrMat->row_offsets->data().get(),
                        upperFactor->csrMat->column_indices->data().get(),
                        upperFactor->solveInfo,
                        workGPU->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* apply the column permutation into the work vector (not safe in place) */
  thrust::copy(thrust::make_permutation_iterator(xarray->begin(), triFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xarray->end(), triFactors->cpermIndices->end()),
               workGPU->begin());

  /* copy the permuted result back into the solution vector */
  thrust::copy(workGPU->begin(), workGPU->end(), xarray->begin());

  /* release the device arrays and wait for completion */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1169 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A x = b on the GPU for a natural-ordering factorization: a forward
   sweep with L followed by a backward sweep with U, with no permutation
   copies.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                       *workGPU = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the solution and right-hand side */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward sweep: L w = b */
  stat = cusparse_solve(triFactors->handle, lowerFactor->solveOp,
                        lowerFactor->csrMat->num_rows, &ALPHA, lowerFactor->descr,
                        lowerFactor->csrMat->values->data().get(),
                        lowerFactor->csrMat->row_offsets->data().get(),
                        lowerFactor->csrMat->column_indices->data().get(),
                        lowerFactor->solveInfo,
                        barray->data().get(), workGPU->data().get());CHKERRCUSP(stat);

  /* backward sweep: U x = w */
  stat = cusparse_solve(triFactors->handle, upperFactor->solveOp,
                        upperFactor->csrMat->num_rows, &ALPHA, upperFactor->descr,
                        upperFactor->csrMat->values->data().get(),
                        upperFactor->csrMat->row_offsets->data().get(),
                        upperFactor->csrMat->column_indices->data().get(),
                        upperFactor->solveInfo,
                        workGPU->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* release the device arrays and wait for completion */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1211 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the host (CPU) AIJ matrix data of A into the GPU-side structures used by
   the CUSPARSE multiply routines. A compressed-row layout is always used on the
   GPU: if the host matrix does not use one, a temporary (ii, ridx) pair is built
   here and freed before returning.

   No-op unless the GPU copy is missing or stale (valid_GPU_matrix is
   PETSC_CUSP_UNALLOCATED or PETSC_CUSP_CPU); on success the matrix is marked
   PETSC_CUSP_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (matstruct) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
    }
    try {
      /* count the rows that contain at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU */
        int k=0;
        ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the matrix information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      /* fix: zero the pointer members immediately; Mat_SeqAIJCUSPARSEMultStruct has no
         constructor, and if neither format branch below fires (e.g. ELL/HYB requested
         with CUDA_VERSION<4020) matstruct->mat would otherwise be indeterminate */
      matstruct->mat          = 0;
      matstruct->cprowIndices = 0;
      matstruct->alpha        = 0;
      matstruct->beta         = 0;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-resident scalar constants so POINTER_MODE_DEVICE can be used */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* a temporary CSR copy on the GPU is needed as the source for csr2hyb */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        /* ELL uses the max-width partition; HYB lets cusparse choose */
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the temporary CSR copy is no longer needed */
        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* the forced-compression index arrays were only host-side staging */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* workVector holds the compressed-row result of y = A*x before scattering */
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1347 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates CUSP vectors compatible with this matrix: *left conforms to the row
   layout (for y in y = A*x) and *right to the column layout (for x). Either
   output pointer may be NULL to skip that vector.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (left) {
    /* left vector: sized by the row layout, shares the row PetscLayout */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  if (right) {
    /* right vector: sized by the column layout, shares the column PetscLayout */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1373 
/* Thrust functor applied to a (source, destination) tuple: accumulates the
   first element into the second, i.e. dest += src. Used to scatter-add the
   compressed-row SpMV result into a full-length vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1383 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Computes yy = A*xx on the GPU via cusparse, dispatching on the chosen
   storage format (CSR vs. ELL/HYB). Assumes the GPU copy of A is current
   (MatSeqAIJCUSPARSECopyToGPU runs in MatAssemblyEnd_SeqAIJCUSPARSE).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xGPU,*yGPU;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR path: alpha/beta live on the device (POINTER_MODE_DEVICE) */
    CsrMatrix *csr = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols, csr->num_entries,
                             matstruct->alpha, matstruct->descr, csr->values->data().get(), csr->row_offsets->data().get(),
                             csr->column_indices->data().get(), xGPU->data().get(), matstruct->beta,
                             yGPU->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB path */
    cusparseHybMat_t hyb = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hyb,
                             xGPU->data().get(), matstruct->beta,
                             yGPU->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only synchronize when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1424 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   Computes yy = A^T * xx on the GPU. An explicit transpose of A is built
   lazily (first call) and then multiplied with NON_TRANSPOSE, which is faster
   in cusparse than a transposed SpMV on the original matrix.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xGPU,*yGPU;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    /* build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yGPU);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csrT = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csrT->num_rows, csrT->num_cols,
                             csrT->num_entries, matstructT->alpha, matstructT->descr,
                             csrT->values->data().get(), csrT->row_offsets->data().get(),
                             csrT->column_indices->data().get(), xGPU->data().get(), matstructT->beta,
                             yGPU->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hyb = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hyb,
                             xGPU->data().get(), matstructT->beta,
                             yGPU->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yGPU);CHKERRQ(ierr);
  /* only synchronize when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1471 
1472 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   Computes zz = A*xx + yy on the GPU. The SpMV result is produced in the
   compressed-row workVector and then scattered-added into zz (which was first
   set to yy) through cprowIndices, so rows of A with no nonzeros contribute
   nothing and zz keeps the yy values there.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* start from zz = yy, then accumulate A*xx into it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* yarray is acquired/released but not referenced directly; presumably this is
       to keep yy's GPU state consistent -- TODO confirm it is actually needed */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* here we need to be careful to set the number of rows in the multiply to the
       number of compressed rows in the matrix ... which is equivalent to the
       size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* guard against an empty work vector (matrix with no nonzero rows) */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray->data().get(), matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation:
       workVector[i] is added to zarray[cprowIndices[i]] for each compressed row i */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1532 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   Computes zz = A^T*xx + yy on the GPU, using the lazily generated explicit
   transpose of A. The SpMV result lands in workVector and is scattered-added
   into zz (pre-loaded with yy) through the transpose's cprowIndices.

   NOTE(review): workVector is sized from the (row-)compressed form of A in
   MatSeqAIJCUSPARSECopyToGPU, while the transpose product has A->cmap->n rows;
   it looks like this relies on MatSeqAIJCUSPARSEGenerateTransposeForMult
   arranging compatible sizes -- confirm against that routine (not visible here).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    /* build the explicit transpose on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy, then accumulate A^T*xx into it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* yarray is acquired/released but not referenced directly -- TODO confirm needed */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* guard against an empty work vector (matrix with no nonzero rows) */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray->data().get(), matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation:
       workVector[i] is added to zarray[cprowIndices[i]] for each compressed row i */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1598 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Finishes assembly: runs the host-side AIJ assembly, mirrors the data onto
   the GPU for non-factored matrices, and (on final assembly only) installs
   the CUSPARSE multiply implementations into the operations table.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* push the freshly assembled values to the GPU */
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: switch the mat-vec operations to the GPU versions */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1617 
1618 /* --------------------------------------------------------------------------------*/
1619 #undef __FUNCT__
1620 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1621 /*@
1622    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1624    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1625    assembly performance the user should preallocate the matrix storage by setting
1626    the parameter nz (or the array nnz).  By setting these parameters accurately,
1627    performance during matrix assembly can be increased by more than a factor of 50.
1628 
1629    Collective on MPI_Comm
1630 
1631    Input Parameters:
1632 +  comm - MPI communicator, set to PETSC_COMM_SELF
1633 .  m - number of rows
1634 .  n - number of columns
1635 .  nz - number of nonzeros per row (same for all rows)
1636 -  nnz - array containing the number of nonzeros in the various rows
1637          (possibly different for each row) or NULL
1638 
1639    Output Parameter:
1640 .  A - the matrix
1641 
1642    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1644    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1645 
1646    Notes:
1647    If nnz is given then nz is ignored
1648 
1649    The AIJ format (also called the Yale sparse matrix format or
1650    compressed row storage), is fully compatible with standard Fortran 77
1651    storage.  That is, the stored row and column indices can begin at
1652    either one (as in Fortran) or zero.  See the users' manual for details.
1653 
1654    Specify the preallocated storage with either nz or nnz (not both).
1655    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1656    allocation.  For large problems you MUST preallocate memory or you
1657    will get TERRIBLE performance, see the users' manual chapter on matrices.
1658 
1659    By default, this format uses inodes (identical nodes) when possible, to
1660    improve numerical efficiency of matrix-vector products and solves. We
1661    search for consecutive rows with the same nonzero structure, thereby
1662    reusing matrix information to achieve increased efficiency.
1663 
1664    Level: intermediate
1665 
1666 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1667 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* standard create/set-sizes/set-type sequence; the type triggers MatCreate_SeqAIJCUSPARSE */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate host-side AIJ storage; nnz (if non-NULL) overrides nz per the man page above */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1679 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Destroys all GPU-side state for the matrix, then chains to the host AIJ
   destructor. Two shapes of spptr exist depending on factortype:
   Mat_SeqAIJCUSPARSE (mult structures, possibly with an explicit transpose)
   for unfactored matrices, and Mat_SeqAIJCUSPARSETriFactors (four triangular
   factor structures plus permutations) for factored ones.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
        Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

        /* delete all the members in each of the data structures */
        if (matstruct) {
          if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
          if (matstruct->mat) {
            /* the stored pointer type depends on the storage format chosen at copy time */
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
              CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
              if (mat->values) delete (THRUSTARRAY*)mat->values;
              if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
              if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
              delete (CsrMatrix*)mat;
            }
          }
          /* alpha/beta are device-side scalars allocated with cudaMalloc */
          if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); }
          if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); }
          if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
          if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
        }
        /* same teardown for the lazily generated explicit transpose, if any */
        if (matstructT) {
          if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
          if (matstructT->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
              CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
              if (matT->values) delete (THRUSTARRAY*)matT->values;
              if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
              if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
              delete (CsrMatrix*)matstructT->mat;
            }
          }
          if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); }
          if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); }
          if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
          if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
        }
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      } else {
        /* GPU data was never allocated: only the handle and the container itself exist */
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

      /* delete all the members in each of the data structures
         (each factor struct: descriptor, solve-analysis info, CSR storage) */
      if (loTriFactor) {
        if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
        if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactor->csrMat) {
          if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
          if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
          if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactor->csrMat;
        }
        if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
      }

      if (upTriFactor) {
        if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
        if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactor->csrMat) {
          if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
          if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
          if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactor->csrMat;
        }
        if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
      }

      if (loTriFactorT) {
        if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
        if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactorT->csrMat) {
          if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
          if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
          if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)loTriFactorT->csrMat;
        }
        if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
      }

      if (upTriFactorT) {
        if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
        if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactorT->csrMat) {
          if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
          if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
          if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
          delete (CsrMatrix*)upTriFactorT->csrMat;
        }
        if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
      }
      /* shared (factor-level) members: permutations, work space, library handle */
      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1822 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor for MATSEQAIJCUSPARSE: builds on MatCreate_SeqAIJ, then
   attaches the GPU container appropriate for the factor type (mult structures
   for unfactored matrices, triangular-factor pointers otherwise), creates the
   cusparse handle, and installs the CUSPARSE operations.

   Cleanup: removed dead stores present in the original (handle/stream fields
   were zeroed and then immediately reassigned, and stream was set twice), and
   replaced repeated (cast)B->spptr expressions with a local pointer.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;
    B->spptr = cusparsestruct;
    cusparsestruct->mat          = 0;
    cusparsestruct->matTranspose = 0;
    cusparsestruct->workVector   = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparsestruct->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr = cusparseTriFactors;
    cusparseTriFactors->loTriFactorPtr          = 0;
    cusparseTriFactors->upTriFactorPtr          = 0;
    cusparseTriFactors->loTriFactorPtrTranspose = 0;
    cusparseTriFactors->upTriFactorPtrTranspose = 0;
    cusparseTriFactors->rpermIndices            = 0;
    cusparseTriFactors->cpermIndices            = 0;
    cusparseTriFactors->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparseTriFactors->handle                  = handle;
    cusparseTriFactors->nnz                     = 0;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1883 
1884 /*M
1885    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1886 
1887    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1888    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1889    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1890 
1891    Options Database Keys:
1892 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1893 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1894 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1895 
1896   Level: beginner
1897 
1898 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1899 M*/
1900