xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2692e278da2d32099a5c231d6e234531f8341747)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 
6 #include "petscconf.h"
7 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include "../src/vec/vec/impls/dvecimpl.h"
10 #include "petsc-private/vecimpl.h"
11 #undef VecType
12 #include "cusparsematimpl.h"
13 
14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19 
20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23 
24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - Associates a CUDA stream with the matrix's cusparse
   handle so that subsequent cusparse calls for this matrix are queued on that
   stream.

   Input Parameters:
+  A      - Matrix of type SEQAIJCUSPARSE (A->spptr holds the Mat_SeqAIJCUSPARSE data)
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct;

  PetscFunctionBegin;
  /* validate A before touching A->spptr, consistent with MatCUSPARSESetFormat() */
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
46 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - Replaces the matrix's cusparse handle, destroying any
   previously held handle, and switches the new handle to device pointer mode.

   Input Parameters:
+  A      - Matrix of type SEQAIJCUSPARSE (A->spptr holds the Mat_SeqAIJCUSPARSE data)
-  handle - the cusparse handle the matrix should use from now on
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* Braces are required here: previously CHKERRCUSP(stat) sat outside the
       braceless if and was executed even when no handle was destroyed,
       checking an uninitialized status value. */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
61 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - Drops the matrix's reference to its cusparse handle
   without destroying the handle itself (ownership lies with whoever supplied
   it, e.g. via MatCUSPARSESetHandle()).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *gpustruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* only a non-NULL handle needs clearing; the handle is not destroyed here */
  if (gpustruct->handle) gpustruct->handle = 0;
  PetscFunctionReturn(0);
}
72 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query routine composed on factor matrices as "MatFactorGetSolverPackage_C";
   simply reports that this factor was produced by the "cusparse" solver package. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
81 
82 /*MC
83   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
84   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
86   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
87   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
88   algorithms are not recommended. This class does NOT support direct solver operations.
89 
90   Consult CUSPARSE documentation for more information about the matrix storage formats
91   which correspond to the options database keys below.
92 
93    Options Database Keys:
94 .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
95 
96   Level: beginner
97 
98 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
99 M*/
100 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine: creates the (square, n x n) factor matrix B of type
   SEQAIJCUSPARSE for the requested factor type and installs the matching
   symbolic factorization routines; those in turn install the numeric routines.
   AIJ array allocation is skipped (MAT_SKIP_ALLOCATION) because the symbolic
   factorization sizes the arrays itself.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;   /* factors are square: n x n */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    /* NOTE(review): block sizes are propagated only in the LU/ILU branch above;
       confirm whether this branch should also call MatSetBlockSizes() */
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  /* expose the solver-package query (see MatFactorGetSolverPackage_seqaij_cusparse) */
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
127 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat(), composed on the
   matrix as "MatCUSPARSESetFormat_C".  For CUDA < 4.2 only CSR exists, so the
   request is ignored unless it asks for ELL/HYB, which is an error.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
    /* per-matrix format used for SpMV (MatMult) */
    cusparsestruct->format = format;
    break;
  case MAT_CUSPARSE_SOLVE:
    /* cusparseMatSolveStorageFormat is a shared default for triangular-solve
       factors — presumably declared in cusparsematimpl.h; note it is global,
       not per-matrix */
    cusparseMatSolveStorageFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    cusparseMatSolveStorageFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
156 
157 /*@
158    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
159    operation. Only the MatMult operation can use different GPU storage formats
160    for MPIAIJCUSPARSE matrices.
161    Not Collective
162 
163    Input Parameters:
164 +  A - Matrix of type SEQAIJCUSPARSE
165 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
166 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
167 
168    Output Parameter:
169 
170    Level: intermediate
171 
172 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
173 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
/* Public entry point; manpage above.  Dispatches to the implementation
   composed as "MatCUSPARSESetFormat_C" on the matrix. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for matrix types that do not compose the method */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
185 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the runtime options that select the GPU storage format:
     -mat_cusparse_mult_storage_format   (unfactored matrices: SpMV data)
     -mat_cusparse_solve_storage_format  (factored matrices: triangular-solve factors)
     -mat_cusparse_storage_format        (both)
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* NOTE(review): the result of PetscObjectOptionsBegin() is assigned but not
     CHKERRQ-checked, and PetscOptionsHead() precedes it — confirm this ordering
     against the PetscOptionsBegin/End macro contract */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  } else {
    ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
    }
  }
  /* the combined option is always offered, regardless of factor state */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
219 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Performs the CPU symbolic ILU factorization, then redirects the numeric
   stage to the CUSPARSE version so the factors are copied to the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
231 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Performs the CPU symbolic LU factorization, then redirects the numeric
   stage to the CUSPARSE version so the factors are copied to the GPU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
243 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Performs the CPU symbolic ICC factorization, then redirects the numeric
   stage to the CUSPARSE Cholesky version so the factors are copied to the GPU. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
255 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Performs the CPU symbolic Cholesky factorization, then redirects the numeric
   stage to the CUSPARSE version so the factors are copied to the GPU. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
267 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Builds the unit-lower-triangular factor L on the GPU from the host-side
   factored AIJ arrays and runs the cusparse triangular-solve analysis on it.
   The (ai,aj,aa) rows read here appear to hold only the strictly lower
   entries of each row, so an explicit 1.0 diagonal is appended per row —
   TODO confirm against the factored SeqAIJ storage layout.
   Work is skipped when the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix; pinned (page-locked)
         host buffers so the later thrust assign/copy to the device is fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix; row 0 contains only the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix; the thrust containers own the device copies */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers; the device copies live on */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
371 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Builds the upper-triangular factor U on the GPU from the host-side factored
   AIJ arrays (using a->diag to locate each row's upper part) and runs the
   cusparse triangular-solve analysis.  Rows are assembled back-to-front.
   NOTE(review): AAUp stores 1./v[nz] on the diagonal — presumably the factored
   AIJ storage keeps the inverted diagonal, so this re-inverts it for
   cusparse's NON_UNIT-diagonal solve; confirm against MatLUFactorNumeric_SeqAIJ.
   Work is skipped when the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host staging buffers) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix; the thrust containers own the device copies */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers; the device copies live on */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
471 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   Builds both ILU triangular factors on the GPU, allocates the intermediate
   work vector used between the two triangular solves, and uploads the row and
   column permutations to the device (only when they are not identities; for
   identity orderings no permutation arrays are created).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the result of the L-solve before the U-solve */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
512 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both triangular factors for an ICC/Cholesky factorization on the GPU.
   Only the upper-triangular pattern is stored: the upper solve uses it
   directly (non-transpose), while the "lower" solve reuses the same pattern
   with CUSPARSE_OPERATION_TRANSPOSE and separate values (AALo).
   NOTE(review): A->data is cast both to Mat_SeqAIJ and Mat_SeqSBAIJ below —
   presumably the ICC factor is laid out in (symmetric) SBAIJ style; verify
   against the ICC numeric factorization routine.
   Work is skipped when the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host staging buffers;
         AALo holds the values used by the transposed "lower" solve) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; v[nz] appears to be the stored
           diagonal of row i — TODO confirm the row layout */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate the off-diagonals for the upper solve; scale additionally
             by the diagonal for the (transposed) lower solve values */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: 0-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: same upper-triangular pattern, but
         non-unit diagonal; the solve below is done with the TRANSPOSE
         operation, which effects the lower-triangular solve */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix (pattern identical to the upper factor, values AALo) */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      /* release the pinned staging buffers; the device copies live on */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
659 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   Builds the ICC triangular factors on the GPU, allocates the intermediate
   work vector, and uploads the (single, symmetric) permutation to the device
   as both the row and column permutation when it is not the identity.
   nnz counts both factors: each off-diagonal entry appears in both, the
   diagonal only once, hence (a->nz - n)*2 + n.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector holding the result of the first solve before the second */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    /* symmetric permutation: the same index set serves as both row and column permutation */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
690 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: perform the host-side factorization, select the
   MatSolve variants that match the ordering, then stage the triangular
   factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the natural-ordering solves skip all permutation traffic on the GPU */
  ierr = ISIdentity(baij->row,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(baij->col,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
717 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: perform the host-side factorization, select
   the MatSolve variants that match the ordering, then stage the triangular
   factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the natural-ordering solves skip all permutation traffic on the GPU */
  ierr = ISIdentity(baij->row,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
744 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Builds explicit transposes (CSC form, stored as CSR) of the lower and upper
   triangular factors and runs the cusparse solve analysis on them, so that
   MatSolveTranspose_* can use NON_TRANSPOSE triangular solves (which are much
   faster than transposed solves in cusparse).

   Precondition: no transpose analysis exists yet for this matrix; the callers
   (MatSolveTranspose_SeqAIJCUSPARSE[_NaturalOrdering]) check
   loTriFactorPtrTranspose/upTriFactorPtrTranspose before calling.

   Fix over the previous revision: loTriFactorT/upTriFactorT were initialized
   from cusparseTriFactors->*PtrTranspose and then immediately overwritten with
   `new`, a dead store that would also have silently leaked any existing
   transpose structures.  They are now declared uninitialized to make the
   allocate-and-publish flow explicit.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;   /* allocated below, published into A->spptr */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;   /* allocated below, published into A->spptr */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor of the lower factor, flipping the fill mode since the
     transpose of a lower triangular matrix is upper triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the transpose is stored explicitly, so the solve itself is non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the transpose of the lower factor so later solves can reuse it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor of the upper factor, flipping the fill mode since the
     transpose of an upper triangular matrix is lower triangular */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the transpose is stored explicitly, so the solve itself is non-transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the transpose of the upper factor so later solves can reuse it */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
877 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds an explicit transpose of the GPU matrix (CSR, ELL, or HYB storage)
   so that MatMultTranspose can run a non-transposed SpMV on it.  The result is
   published in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose; callers invoke
   this only when matTranspose is not yet set.

   Fix over the previous revision: matstructT was initialized from
   cusparsestruct->matTranspose and then immediately overwritten with `new`, a
   dead store that would also have silently leaked any existing transpose.  It
   is now declared uninitialized to make the allocate-and-publish flow
   explicit.  A redundant second read of the index base inside the CSR branch
   was also dropped (indexBase is already read from matstruct->descr above).
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;   /* allocated below, published into A->spptr */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate the multiply structure for the transpose and mirror the
     descriptor settings of the original matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* device-resident alpha=ALPHA and beta=BETA for the pointer-mode-device SpMV calls */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC stored as the CSR of A^T */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete the two CSR temporaries used for the conversion */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty: the transpose is not row-compressed) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the transpose so later MatMultTranspose calls can reuse it */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1006 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solve A^T x = b on the GPU for a factored matrix with a non-identity
   ordering: permute b, solve with the transposed upper factor, then the
   transposed lower factor, and permute the result back. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Lazily build and analyze the transposed factors the first time a
     transpose solve is requested */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation: x[i] = b[rperm[i]].
     NOTE(review): this applies rpermIndices to b and cpermIndices to x, the
     same as the non-transpose MatSolve; for a transpose solve with distinct
     row/column orderings one would expect the permutations applied the other
     way around — confirm.  For ICC both permutations are identical (see
     MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU), so that case is unaffected. */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve with the transposed upper factor (result in tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve with the transposed lower factor (result back in xGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  /* two triangular solves: roughly 2*nnz flops minus the n divisions counted once */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1070 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solve A^T x = b on the GPU for a naturally ordered factorization: no
   permutations are needed, so b feeds the transposed-upper solve directly and
   the transposed-lower solve writes x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Lazily build and analyze the transposed factors on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve with the transposed upper factor first (result in work) ... */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows, &ALPHA, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* ... then with the transposed lower factor, producing x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows, &ALPHA, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1120 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solve A x = b on the GPU for a factored matrix with a non-identity
   ordering: permute b by the row ordering, forward/back substitute with the
   triangular factors, then apply the column ordering to the result. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation: x[i] = b[rperm[i]] (a gather) */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L (forward substitution; result in tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U (back substitution; result back in xGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  /* two triangular solves: roughly 2*nnz flops minus the n divisions counted once */
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1175 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solve A x = b on the GPU for a naturally ordered factorization: no
   permutations are needed, so b feeds the L solve directly and the U solve
   writes x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward substitution with L (result in work) ... */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* ... then back substitution with U, producing x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1217 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the host-side AIJ matrix data (Mat_SeqAIJ) to the GPU, building the
   Mat_SeqAIJCUSPARSEMultStruct consumed by the MatMult*_SeqAIJCUSPARSE kernels.
   A compressed-row index (rows with at least one nonzero) is always built on
   the GPU: it is reused from a->compressedrow when available, otherwise it is
   constructed here.  The device storage format (CSR, ELL, or HYB) follows
   cusparsestruct->format.  This is a no-op unless the host copy is newer than
   the device copy (valid_GPU_matrix is UNALLOCATED or CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* the GPU structure is rebuilt from scratch here; any previous one must
       already have been torn down by the destroy path */
    if (matstruct) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
    }
    try {
      /* count the rows that contain at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* the host already maintains a compressed-row index; reuse it */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build row offsets (ii) and the
           original-row index (ridx) over the nonzero rows only; these
           temporaries are freed below once copied to the device */
        int k=0;
        ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        /* from here on m is the COMPRESSED row count */
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the triangular factor information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-resident alpha=ALPHA and beta=BETA scalars so SpMV can run with
         CUSPARSE_POINTER_MODE_DEVICE (required for stream-friendly operation) */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
	CsrMatrix *matrix= new CsrMatrix;
	matrix->num_rows = A->rmap->n;
	matrix->num_cols = A->cmap->n;
	matrix->num_entries = a->nz;
	/* NOTE(review): row_offsets is sized A->rmap->n+1 but assign() resizes
	   it to nonzerorow+1 entries; when empty rows exist num_rows disagrees
	   with the offsets length -- confirm against the SpMV call sites */
	matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
	matrix->row_offsets->assign(ii, ii + cusparsestruct->nonzerorow+1);

	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
	matrix->column_indices->assign(a->j, a->j+a->nz);

	matrix->values = new THRUSTARRAY(a->nz);
	matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
	/* ELL/HYB: first stage the data in a temporary device CSR matrix,
	   then convert with cusparse_csr2hyb and discard the CSR copy */
	CsrMatrix *matrix= new CsrMatrix;
	matrix->num_rows = A->rmap->n;
	matrix->num_cols = A->cmap->n;
	matrix->num_entries = a->nz;
	matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
	matrix->row_offsets->assign(ii, ii + cusparsestruct->nonzerorow+1);

	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
	matrix->column_indices->assign(a->j, a->j+a->nz);

	matrix->values = new THRUSTARRAY(a->nz);
	matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        /* ELL maps to the MAX partition (pure ELL), HYB lets cusparse choose */
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

	/* the staging CSR matrix is no longer needed after the conversion */
	if (matrix) {
	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
	  delete (CsrMatrix*)matrix;
	}
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* the ii/ridx temporaries were allocated here only in the
         non-compressed-row branch above */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* work vector sized to the compressed row count; used as the SpMV
         destination before scattering into a full-length output vector */
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1353 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
/*
   Creates CUSP (GPU) vectors conforming to the matrix layouts: the right
   vector matches the column layout (for y = A*x, x is "right") and the left
   vector matches the row layout.  Either output pointer may be NULL.
*/
static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the right vector conforms to the COLUMN layout, so its block size must
       come from cmap (the previous code used rmap->bs, which is wrong for
       rectangular matrices whose row/column block sizes differ) */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* the left vector conforms to the row layout */
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1377 
/* Thrust functor: accumulate the first tuple element into the second
   (dest += src).  Used to scatter-add the compressed-row work vector into a
   full-length output vector via a permutation iterator. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1387 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Computes yy = A*xx on the GPU via cusparse SpMV, using either the CSR or
   the ELL/HYB device representation built by MatSeqAIJCUSPARSECopyToGPU.
   With ALPHA/BETA living on the device, the result overwrites yy.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* NOTE(review): mat->num_rows is A->rmap->n while row_offsets was filled
       with only nonzerorow+1 entries in the copy-to-GPU path; for matrices
       with empty rows these disagree -- confirm against CopyToGPU */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB formats both use the cusparse hybrid SpMV entry point */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when a user stream is set, synchronization is left to the caller */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1428 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   Computes yy = A^T*xx on the GPU.  Rather than calling cusparse with the
   TRANSPOSE operation (which is slow), an explicit transpose of the matrix is
   built lazily on first use and a NON_TRANSPOSE SpMV is run against it.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* lazily build the explicit transpose the first time it is needed */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* when a user stream is set, synchronization is left to the caller */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1475 
1476 
1477 #undef __FUNCT__
1478 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
1479 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1480 {
1481   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1482   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1483   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1484   CUSPARRAY                    *xarray,*yarray,*zarray;
1485   PetscErrorCode               ierr;
1486   cusparseStatus_t             stat;
1487 
1488   PetscFunctionBegin;
1489   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1490      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1491   try {
1492     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
1493     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1494     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
1495     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1496 
1497     /* multiply add */
1498     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1499       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1500       /* here we need to be careful to set the number of rows in the multiply to the
1501 	 number of compressed rows in the matrix ... which is equivalent to the
1502 	 size of the workVector */
1503       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1504                                cusparsestruct->workVector->size(), mat->num_cols,
1505                                mat->num_entries, matstruct->alpha, matstruct->descr,
1506                                mat->values->data().get(), mat->row_offsets->data().get(),
1507                                mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
1508                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1509     } else {
1510 #if CUDA_VERSION>=4020
1511       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1512       stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1513 			       matstruct->alpha, matstruct->descr, hybMat,
1514 			       xarray->data().get(), matstruct->beta,
1515 			       cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1516 #endif
1517     }
1518 
1519     /* scatter the data from the temporary into the full vector with a += operation */
1520     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
1521 		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1522 		     VecCUSPPlusEquals());
1523     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1524     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
1525     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1526 
1527   } catch(char *ex) {
1528     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1529   }
1530   ierr = WaitForGPU();CHKERRCUSP(ierr);
1531   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1532   PetscFunctionReturn(0);
1533 }
1534 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A^T*xx on the GPU.  Uses the lazily-built explicit
   transpose (like MatMultTranspose) and the compressed work-vector
   scatter-add scheme (like MatMultAdd).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* lazily build the explicit transpose the first time it is needed */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy; the product is added on top below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* yarray is acquired/restored for access bookkeeping only; yy was already
       copied into zz above */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
	 number of compressed rows in the matrix ... which is equivalent to the
	 size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               cusparsestruct->workVector->size(), mat->num_cols,
			       mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
			       matstructT->alpha, matstructT->descr, hybMat,
			       xarray->data().get(), matstructT->beta,
			       cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
		     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1598 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Completes assembly: finishes the host-side SeqAIJ assembly, mirrors the
   data to the GPU (for non-factored matrices), and on a final assembly
   installs the GPU multiply kernels.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side (CPU) assembly first */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their data in the triangular-factor structures */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* a flush assembly leaves the operations untouched */
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1617 
1618 /* --------------------------------------------------------------------------------*/
1619 #undef __FUNCT__
1620 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1621 /*@
1622    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1623    (the default parallel PETSc format). This matrix will ultimately pushed down
1624    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1625    assembly performance the user should preallocate the matrix storage by setting
1626    the parameter nz (or the array nnz).  By setting these parameters accurately,
1627    performance during matrix assembly can be increased by more than a factor of 50.
1628 
1629    Collective on MPI_Comm
1630 
1631    Input Parameters:
1632 +  comm - MPI communicator, set to PETSC_COMM_SELF
1633 .  m - number of rows
1634 .  n - number of columns
1635 .  nz - number of nonzeros per row (same for all rows)
1636 -  nnz - array containing the number of nonzeros in the various rows
1637          (possibly different for each row) or NULL
1638 
1639    Output Parameter:
1640 .  A - the matrix
1641 
1642    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1643    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1644    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1645 
1646    Notes:
1647    If nnz is given then nz is ignored
1648 
1649    The AIJ format (also called the Yale sparse matrix format or
1650    compressed row storage), is fully compatible with standard Fortran 77
1651    storage.  That is, the stored row and column indices can begin at
1652    either one (as in Fortran) or zero.  See the users' manual for details.
1653 
1654    Specify the preallocated storage with either nz or nnz (not both).
1655    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1656    allocation.  For large problems you MUST preallocate memory or you
1657    will get TERRIBLE performance, see the users' manual chapter on matrices.
1658 
1659    By default, this format uses inodes (identical nodes) when possible, to
1660    improve numerical efficiency of matrix-vector products and solves. We
1661    search for consecutive rows with the same nonzero structure, thereby
1662    reusing matrix information to achieve increased efficiency.
1663 
1664    Level: intermediate
1665 
1666 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1667 @*/
1668 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1669 {
1670   PetscErrorCode ierr;
1671 
1672   PetscFunctionBegin;
1673   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1674   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1675   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1676   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1677   PetscFunctionReturn(0);
1678 }
1679 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Destroys all GPU-resident data attached to the matrix.  Two cases:
   non-factored matrices free the multiply structures (matrix data, transpose,
   descriptors, device alpha/beta scalars, compressed-row indices, work
   vector); factored matrices free the four triangular-factor structures plus
   permutation indices and work vector.  Finally delegates to
   MatDestroy_SeqAIJ for the host-side data.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
        Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

        /* delete all the members in each of the data structures */
        if (matstruct) {
          if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
          if (matstruct->mat) {
            /* the stored pointer type depends on the chosen storage format */
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
	      CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
	      if (mat->values) delete (THRUSTARRAY*)mat->values;
	      if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
	      if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
              delete (CsrMatrix*)matstruct->mat;
            }
          }
	  /* device-resident alpha/beta scalars allocated in CopyToGPU */
	  if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); }
	  if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); }
          if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
          if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
        }
        /* same teardown for the (optional) explicit transpose */
        if (matstructT) {
          if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
          if (matstructT->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
	      CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
	      if (matT->values) delete (THRUSTARRAY*)matT->values;
	      if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
	      if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
              delete (CsrMatrix*)matstructT->mat;
            }
          }
	  if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); }
	  if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); }
          if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
          if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
        }
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      } else {
        /* nothing was ever copied to the GPU: only the handle exists */
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

      /* delete all the members in each of the data structures */
      if (loTriFactor) {
        if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
        if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactor->csrMat) {
	  if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
	  if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
	  if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
	  delete (CsrMatrix*)loTriFactor->csrMat;
	}
        if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
      }

      if (upTriFactor) {
        if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
        if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactor->csrMat) {
	  if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
	  if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
	  if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
	  delete (CsrMatrix*)upTriFactor->csrMat;
	}
        if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
      }

      if (loTriFactorT) {
        if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
        if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactorT->csrMat) {
	  if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
	  if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
	  if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
	  delete (CsrMatrix*)loTriFactorT->csrMat;
	}
        if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
      }

      if (upTriFactorT) {
        if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
        if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactorT->csrMat) {
	  if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
	  if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
	  if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
	  delete (CsrMatrix*)upTriFactorT->csrMat;
	}
        if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
      }
      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1822 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor for MATSEQAIJCUSPARSE: builds on MatCreate_SeqAIJ, then
   attaches either a Mat_SeqAIJCUSPARSE (multiply path) or a
   Mat_SeqAIJCUSPARSETriFactors (factored/solve path) to A->spptr, creates the
   cusparse handle, and installs the CUSPARSE operation table.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    B->spptr = new Mat_SeqAIJCUSPARSE;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
    /* NOTE(review): stream was already zeroed above; this re-assignment looks
       redundant but is harmless */
    ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format                  = cusparseMatSolveStorageFormat;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
    ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1884 
1885 /*M
1886    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1887 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1889    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1890    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1891 
1892    Options Database Keys:
1893 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1894 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1895 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_solve_storage_format csr - sets the storage format of matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1897 
1898   Level: beginner
1899 
1900 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1901 M*/
1902