xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 44b85a236d0c752951b0573ba76bfb3134d48c1e)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 
6 #include <petscconf.h>
7 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include <../src/vec/vec/impls/dvecimpl.h>
10 #include <petsc-private/vecimpl.h>
11 #undef VecType
12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
13 
/* String names for the MatCUSPARSEStorageFormat enum, consumed by PetscOptionsEnum();
   the trailing entries are the enum type name, the option prefix, and a terminator,
   as required by PETSc's enum-option convention. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
/* ICC/Cholesky factorization kernels (installed on factor matrices by MatGetFactor below) */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* ILU/LU factorization kernels */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves, options handling, and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - Attaches a CUDA stream to the cuSPARSE handle held by a
   SeqAIJCUSPARSE matrix so that subsequent cuSPARSE calls for this matrix are
   queued on that stream.

   Input Parameters:
+  A      - matrix whose A->spptr holds a Mat_SeqAIJCUSPARSE structure
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *spstruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  /* remember the stream, then route the cuSPARSE handle onto it */
  spstruct->stream = stream;
  status = cusparseSetStream(spstruct->handle,spstruct->stream);CHKERRCUSP(status);
  PetscFunctionReturn(0);
}
46 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - Installs the given cuSPARSE handle on a SeqAIJCUSPARSE
   matrix, destroying any handle the matrix previously held, and switches the
   handle to device pointer mode.

   Input Parameters:
+  A      - matrix whose A->spptr holds a Mat_SeqAIJCUSPARSE structure
-  handle - the cuSPARSE handle to install
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* Braces are required here: previously CHKERRCUSP(stat) sat outside the if and
       executed unconditionally, reading an uninitialized 'stat' when no handle was set. */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
61 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/* Drops the matrix's reference to its cuSPARSE handle without destroying it
   (the handle itself is left to its owner to dispose of). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *spstruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (spstruct->handle) spstruct->handle = 0;
  PetscFunctionReturn(0);
}
72 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the
   solver package that produced the factor.  The input matrix A is unused. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
81 
82 /*MC
83   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
84   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
87   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
88   algorithms are not recommended. This class does NOT support direct solver operations.
89 
90   Level: beginner
91 
92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
93 M*/
94 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Creates an (unfactored) SEQAIJCUSPARSE matrix *B of the same size as A and
   installs the symbolic-factorization entry points matching the requested
   factor type; also composes the solver-package query callback on *B.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* factor matrices get their storage from the factorization, not from preallocation */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE:
   records the requested GPU storage format in the matrix's cuSPARSE struct. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations share one format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* pre-4.2 CUDA supports only CSR; reject the formats that require 4.2 */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
146 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.
   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the implementation composed on A, if any; a no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
175 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes runtime options selecting the GPU storage format of a SeqAIJCUSPARSE
   matrix:
     -mat_cusparse_mult_storage_format : format used for MatMult only (unfactored matrices)
     -mat_cusparse_storage_format      : format used for both MatMult and TriSolve
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* Begin must precede Head (it establishes the options context Head uses), and
     the return code must be checked; both were wrong before this fix. */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the MatMult-only format applies solely to unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  /* Head must be paired with Tail before ending the options block */
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
203 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU factorization: delegates entirely to the CPU SeqAIJ
   implementation, then installs the CUSPARSE numeric routine so the numeric
   stage will also build the GPU triangular factors.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* override the numeric stage with the CUSPARSE variant */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
215 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU factorization: delegates to the CPU SeqAIJ implementation, then
   installs the CUSPARSE numeric routine for the numeric stage.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* override the numeric stage with the CUSPARSE variant */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
227 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ICC factorization: delegates to the CPU SeqAIJ implementation, then
   installs the CUSPARSE Cholesky numeric routine for the numeric stage.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* override the numeric stage with the CUSPARSE variant */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
239 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic Cholesky factorization: delegates to the CPU SeqAIJ implementation,
   then installs the CUSPARSE Cholesky numeric routine for the numeric stage.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* override the numeric stage with the CUSPARSE variant */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
251 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Assembles the unit lower-triangular factor L as an explicit CSR matrix in
   pinned host memory, uploads it to the GPU, and runs the cuSPARSE
   triangular-solve analysis on it.  The result is stored in
   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr.

   NOTE(review): this assumes the rows of the factored SeqAIJ matrix store the
   strictly-lower entries of L at the start of each row (the unit diagonal being
   implicit) -- confirm against MatLUFactorNumeric_SeqAIJ's factor storage layout.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* (re)build only when no up-to-date GPU copy exists */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (rows 1..n-1 contribute ai[i+1]-ai[i] off-diagonal entries; every row gets one unit diagonal) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host staging buffers; note 'ierr' holds a cudaError_t here, checked by CHKERRCUSP */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the explicit unit diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): a previously-built loTriFactor is not freed here; verify this path
         only runs once per factorization so no GPU memory is leaked */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the assembled CSR arrays to device vectors */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the data now lives in device vectors; release the pinned staging buffers */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
355 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Assembles the upper-triangular factor U as an explicit CSR matrix in pinned
   host memory, uploads it to the GPU, and runs the cuSPARSE triangular-solve
   analysis on it.  The result is stored in
   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr.

   NOTE(review): this walks U via a->diag (adiag), addressing rows in reverse,
   and takes 1./v[nz] for the diagonal -- presumably because the SeqAIJ factor
   stores the inverse of U's diagonal; confirm against the factor storage layout.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* (re)build only when no up-to-date GPU copy exists */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; note 'ierr' holds a cudaError_t here, checked by CHKERRCUSP */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): a previously-built upTriFactor is not freed here; verify this path
         only runs once per factorization so no GPU memory is leaked */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the assembled CSR arrays to device vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the data now lives in device vectors; release the pinned staging buffers */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
455 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   Driver for the ILU/LU GPU setup: uploads both triangular factors, allocates a
   device work vector (one entry per row), and caches the row/column permutations
   on the GPU when they are not the identity.  Marks the GPU copy as current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector sized to the number of rows */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only copy a permutation to the GPU when one is actually needed */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
496 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both ICC/Cholesky triangular factors on the GPU from a single
   upper-triangular CSR assembly: the upper factor uses the AAUp values with a
   unit diagonal, while the lower factor reuses the same sparsity pattern with
   the AALo values and is solved with the TRANSPOSE operation (i.e. as U^T).
   Results are stored in upTriFactorPtr/loTriFactorPtr of A->spptr.

   NOTE(review): 1.0/v[nz] is taken for the diagonal -- presumably the factor
   stores the inverse diagonal; confirm against the SeqAIJ Cholesky storage.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is also viewed through Mat_SeqSBAIJ here; this relies on
     the i/j/a fields of Mat_SeqAIJ and Mat_SeqSBAIJ lining up -- verify */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  /* (re)build only when no up-to-date GPU copy exists */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host staging buffers; note 'ierr' holds a cudaError_t here, checked by CHKERRCUSP */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries; AALo gets the AAUp values rescaled by 1/diag */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      /* NOTE(review): previously-built factors are not freed here; verify this path
         only runs once per factorization so no GPU memory is leaked */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the assembled CSR arrays to device vectors */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor is stored as the same upper pattern and solved transposed */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity pattern as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* mark the GPU copy current and release the pinned staging buffers */
      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
643 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   Driver for the ICC/Cholesky GPU setup: builds both triangular factors on the
   GPU, allocates a device work vector (one entry per row), and caches the
   single symmetric permutation (used for both rows and columns) on the GPU
   when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  const PetscInt               *permArray;
  PetscBool                    isIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* device scratch vector sized to the number of rows */
  factors->workVector = new THRUSTARRAY;
  factors->workVector->resize(nrows);
  /* both factors share the strict triangle; the diagonal is counted once */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  /*lower triangular indices */
  ierr = ISGetIndices(perm,&permArray);CHKERRQ(ierr);
  ierr = ISIdentity(perm,&isIdentity);CHKERRQ(ierr);
  if (!isIdentity) {
    /* the same (symmetric) permutation serves as both row and column permutation */
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(permArray, permArray+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(permArray, permArray+nrows);
  }
  ierr = ISRestoreIndices(perm,&permArray);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
674 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/*
   Numeric LU/ILU factorization: performs the factorization on the CPU, selects
   the appropriate MatSolve variant (the natural-ordering one skips permutation
   work), then uploads the triangular factors and runs the solve analysis.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      naturalRow,naturalCol;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself happens on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&naturalRow);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&naturalCol);CHKERRQ(ierr);
  if (naturalRow && naturalCol) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
701 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Runs the host numeric Cholesky factorization, selects the appropriate GPU
   solve routines based on whether the ordering is the identity, and copies
   the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* an identity permutation allows the cheaper natural-ordering kernels */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* move the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
728 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Builds the transposes (via CSR->CSC conversion) of the cached lower and
   upper triangular factors on the GPU and performs the cusparse solve
   analysis on each, so that MatSolveTranspose can run non-transposed solves.
   The results are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose;
   callers are expected to invoke this only when those pointers are empty. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* built below; any previously cached transpose is not consulted */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation: the transpose is stored explicitly, so the solve itself
     is non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* cache the transposed lower factor on the matrix */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor;
     the fill mode flips under transposition */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* cache the transposed upper factor on the matrix */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
861 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Builds the GPU representation of A^T (for MatMultTranspose) from the cached
   GPU matrix. For CSR storage this is a single csr2csc; for ELL/HYB storage
   (CUDA >= 5.0 only) the matrix is converted HYB->CSR->CSC->HYB. The result
   is cached in cusparsestruct->matTranspose; callers are expected to invoke
   this only when that pointer is empty. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* built below; any previously cached transpose is not consulted */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transposed matrix information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta as device-resident scalars (pointer mode DEVICE) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* cache the transpose on the matrix */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
990 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solves A^T x = b for a permuted factorization using the cached transposed
   triangular factors (generated lazily on first use): apply the row
   permutation to b, solve with U^T, then L^T, then apply the column
   permutation to the result. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray, *barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors lazily, on first use */
  if (!lowerT && !upperT) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  }

  /* device pointers for right hand side and solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* apply the row permutation to b, placing the result in x */
  thrust::copy(thrust::make_permutation_iterator(barray->begin(), triFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(barray->end(), triFactors->rpermIndices->end()),
               xarray->begin());

  /* solve with the transposed upper factor */
  stat = cusparse_solve(triFactors->handle, upperT->solveOp,
                        upperT->csrMat->num_rows, &ALPHA, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        xarray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* solve with the transposed lower factor */
  stat = cusparse_solve(triFactors->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows, &ALPHA, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* apply the column permutation; this cannot be done in place, so route
     the permuted values through the work vector and copy them back */
  thrust::copy(thrust::make_permutation_iterator(xarray->begin(), triFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xarray->end(), triFactors->cpermIndices->end()),
               work->begin());
  thrust::copy(work->begin(), work->end(), xarray->begin());

  /* restore the vector arrays */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1054 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A^T x = b in natural ordering using the cached transposed
   triangular factors (generated lazily on first use): solve with U^T,
   then with L^T. No permutations are applied. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors lazily, on first use */
  if (!lowerT && !upperT) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  }

  /* device pointers for right hand side and solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve with the transposed upper factor */
  stat = cusparse_solve(triFactors->handle, upperT->solveOp,
                        upperT->csrMat->num_rows, &ALPHA, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* solve with the transposed lower factor */
  stat = cusparse_solve(triFactors->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows, &ALPHA, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* restore the vector arrays */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1104 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solves A x = b for a permuted factorization: verify both vectors are of
   type VECSEQCUSP, apply the row permutation to b, solve with L then U,
   and finally apply the column permutation to the result. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;
  VecType                           vtype;
  PetscBool                         iscusp;

  PetscFunctionBegin;
  /* both vectors must live on the GPU */
  ierr = VecGetType(bb,&vtype);CHKERRQ(ierr);
  ierr = PetscStrcmp(vtype,VECSEQCUSP,&iscusp);CHKERRQ(ierr);
  if (!iscusp) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",vtype,VECSEQCUSP);
  ierr = VecGetType(xx,&vtype);CHKERRQ(ierr);
  ierr = PetscStrcmp(vtype,VECSEQCUSP,&iscusp);CHKERRQ(ierr);
  if (!iscusp) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",vtype,VECSEQCUSP);

  /* device pointers for right hand side and solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* apply the row permutation to b, placing the result in x */
  thrust::copy(thrust::make_permutation_iterator(barray->begin(), triFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(barray->end(), triFactors->rpermIndices->end()),
               xarray->begin());

  /* forward sweep with the lower factor */
  stat = cusparse_solve(triFactors->handle, lower->solveOp,
                        lower->csrMat->num_rows, &ALPHA, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        xarray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* backward sweep with the upper factor */
  stat = cusparse_solve(triFactors->handle, upper->solveOp,
                        upper->csrMat->num_rows, &ALPHA, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* apply the column permutation; this cannot be done in place, so route
     the permuted values through the work vector and copy them back */
  thrust::copy(thrust::make_permutation_iterator(xarray->begin(), triFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xarray->end(), triFactors->cpermIndices->end()),
               work->begin());
  thrust::copy(work->begin(), work->end(), xarray->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1168 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A x = b in natural ordering: forward sweep with L into the work
   vector, then backward sweep with U into x. No permutations are applied. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for right hand side and solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward sweep with the lower factor */
  stat = cusparse_solve(triFactors->handle, lower->solveOp,
                        lower->csrMat->num_rows, &ALPHA, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* backward sweep with the upper factor */
  stat = cusparse_solve(triFactors->handle, upper->solveOp,
                        upper->csrMat->num_rows, &ALPHA, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1210 
1211 #undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the host CSR data of A to the GPU and builds the
   Mat_SeqAIJCUSPARSEMultStruct used by the MatMult kernels.

   The GPU copy is always stored in compressed-row form: rows with no nonzeros
   are squeezed out, and matstruct->cprowIndices maps compressed row numbers
   back to full row numbers.  Depending on cusparsestruct->format the matrix is
   kept in CSR or converted to the ELL/HYB representation.  A no-op when the
   GPU copy is already current (valid_GPU_matrix is GPU or BOTH).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    if (matstruct) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
    }
    try {
      /* count the rows that have at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU */
        int k=0;
        ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate and fill the mult data structure (descriptor + scaling constants) */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* the spmv scaling constants must live in device memory because the
         handle is switched to CUSPARSE_POINTER_MODE_DEVICE below */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
	CsrMatrix *matrix= new CsrMatrix;
	matrix->num_rows = m;
	matrix->num_cols = A->cmap->n;
	matrix->num_entries = a->nz;
	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
	matrix->row_offsets->assign(ii, ii + m+1);

	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
	matrix->column_indices->assign(a->j, a->j+a->nz);

	matrix->values = new THRUSTARRAY(a->nz);
	matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* a temporary device CSR copy is built first, then converted to
           ELL/HYB with cusparse_csr2hyb and discarded below */
	CsrMatrix *matrix= new CsrMatrix;
	matrix->num_rows = m;
	matrix->num_cols = A->cmap->n;
	matrix->num_entries = a->nz;
	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
	matrix->row_offsets->assign(ii, ii + m+1);

	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
	matrix->column_indices->assign(a->j, a->j+a->nz);

	matrix->values = new THRUSTARRAY(a->nz);
	matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the temporary CSR copy is no longer needed once converted */
	if (matrix) {
	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
	  delete (CsrMatrix*)matrix;
	}
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were PetscMalloc'ed above only when compression was forced */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* scratch vector of length m (number of compressed rows); used by the
         MatMultAdd/MatMultTransposeAdd scatter step */
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1346 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates CUSP vectors conforming to the matrix layouts: a "right" vector
   matches the column map (a MatMult input) and a "left" vector matches the
   row map (a MatMult output).  Either output pointer may be NULL.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowBlockSize,colBlockSize;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowBlockSize,&colBlockSize);CHKERRQ(ierr);
  if (right) {
    /* right-hand vector: sized and blocked by the column map */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colBlockSize);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* left-hand vector: sized and blocked by the row map */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowBlockSize);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1372 
/* Functor for thrust::for_each over a zip iterator: accumulates the first
   tuple element into the second, i.e. dest += src.  Used to scatter-add the
   compressed-row spmv result into a full-length vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1382 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Computes yy = A*xx with a cusparse spmv on either the CSR or the ELL/HYB
   copy of A (selected by cusparsestruct->format).

   NOTE(review): the CSR path writes mat->num_rows entries straight into yy,
   but the GPU copy is stored in compressed-row form (see
   MatSeqAIJCUSPARSECopyToGPU), so this appears to assume num_rows equals the
   full row count -- confirm for matrices with empty rows; MatMultAdd below
   scatters through cprowIndices instead.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* y = alpha*A*x + beta*y; alpha/beta are device-resident constants set in
       MatSeqAIJCUSPARSECopyToGPU (pointer mode is DEVICE) */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* a user-supplied stream implies asynchronous operation; only block when none is set */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1423 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   Computes yy = A^T*xx.  Rather than asking cusparse for a transposed spmv,
   an explicit transpose copy of the matrix (cusparsestruct->matTranspose) is
   built lazily on first use and multiplied with
   CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    /* build the explicit transpose copy on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* a user-supplied stream implies asynchronous operation; only block when none is set */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1470 
1471 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A*xx.  yy is first copied into zz, the product A*xx is
   computed into the compressed-row scratch vector (workVector), and the
   result is scatter-added into zz through matstruct->cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is fetched but never referenced below (yy was
       already copied into zz); presumably only for CUSP array bookkeeping --
       confirm whether this pair can be dropped */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
	 number of compressed rows in the matrix ... which is equivalent to the
	 size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the spmv entirely when the matrix has no nonzero rows */
      if (cusparsestruct->workVector->size()) {
	stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
				 matstruct->alpha, matstruct->descr, hybMat,
				 xarray->data().get(), matstruct->beta,
				 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
		     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1531 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A^T*xx using the lazily built explicit transpose copy
   (cusparsestruct->matTranspose).  yy is first copied into zz, the product is
   computed into workVector, and the result is scatter-added into zz through
   matstructT->cprowIndices.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  if (!matstructT) {
    /* build the explicit transpose copy on first use */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is fetched but never referenced below (yy was
       already copied into zz); presumably only for CUSP array bookkeeping --
       confirm whether this pair can be dropped */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
	 number of compressed rows in the matrix ... which is equivalent to the
	 size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
			       mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the spmv entirely when the matrix has no nonzero rows */
      if (cusparsestruct->workVector->size()) {
	stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
				 matstructT->alpha, matstructT->descr, hybMat,
				 xarray->data().get(), matstructT->beta,
				 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
		     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1597 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Finishes assembly on the host, pushes the data to the GPU (non-factor
   matrices only), and -- for a final assembly -- installs the CUSPARSE
   multiplication kernels.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: wire up the GPU mult operations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1616 
1617 /* --------------------------------------------------------------------------------*/
1618 #undef __FUNCT__
1619 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1620 /*@
1621    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1623    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1624    assembly performance the user should preallocate the matrix storage by setting
1625    the parameter nz (or the array nnz).  By setting these parameters accurately,
1626    performance during matrix assembly can be increased by more than a factor of 50.
1627 
1628    Collective on MPI_Comm
1629 
1630    Input Parameters:
1631 +  comm - MPI communicator, set to PETSC_COMM_SELF
1632 .  m - number of rows
1633 .  n - number of columns
1634 .  nz - number of nonzeros per row (same for all rows)
1635 -  nnz - array containing the number of nonzeros in the various rows
1636          (possibly different for each row) or NULL
1637 
1638    Output Parameter:
1639 .  A - the matrix
1640 
1641    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1643    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1644 
1645    Notes:
1646    If nnz is given then nz is ignored
1647 
1648    The AIJ format (also called the Yale sparse matrix format or
1649    compressed row storage), is fully compatible with standard Fortran 77
1650    storage.  That is, the stored row and column indices can begin at
1651    either one (as in Fortran) or zero.  See the users' manual for details.
1652 
1653    Specify the preallocated storage with either nz or nnz (not both).
1654    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1655    allocation.  For large problems you MUST preallocate memory or you
1656    will get TERRIBLE performance, see the users' manual chapter on matrices.
1657 
1658    By default, this format uses inodes (identical nodes) when possible, to
1659    improve numerical efficiency of matrix-vector products and solves. We
1660    search for consecutive rows with the same nonzero structure, thereby
1661    reusing matrix information to achieve increased efficiency.
1662 
1663    Level: intermediate
1664 
1665 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1666 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create an empty matrix, give it the AIJCUSPARSE type, then preallocate */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocation: nnz (per-row counts) takes precedence over nz when given */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1678 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Destroys all GPU-side data hanging off A->spptr, then falls through to the
   host-side MatDestroy_SeqAIJ.  Two cases: a regular (non-factor) matrix
   holds a Mat_SeqAIJCUSPARSE with the mult structures; a factor matrix holds
   a Mat_SeqAIJCUSPARSETriFactors with up to four triangular-factor structures.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    try {
      if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
        Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
        Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;

        /* delete all the members in each of the data structures */
        if (matstruct) {
          if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
          if (matstruct->mat) {
            /* matstruct->mat is a cusparseHybMat_t for ELL/HYB, a CsrMatrix* for CSR */
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
	      CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
	      if (mat->values) delete (THRUSTARRAY*)mat->values;
	      if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
	      if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
              delete (CsrMatrix*)matstruct->mat;
            }
          }
          /* alpha/beta are device-resident spmv constants (cudaMalloc'ed) */
	  if (matstruct->alpha) { err=cudaFree(matstruct->alpha);CHKERRCUSP(err); }
	  if (matstruct->beta) { err=cudaFree(matstruct->beta);CHKERRCUSP(err); }
          if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
          if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
        }
        /* same teardown for the (lazily built) transpose copy, if any */
        if (matstructT) {
          if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
          if (matstructT->mat) {
            if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
              cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
              stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
#endif
            } else {
	      CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
	      if (matT->values) delete (THRUSTARRAY*)matT->values;
	      if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
	      if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
              delete (CsrMatrix*)matstructT->mat;
            }
          }
	  if (matstructT->alpha) { err=cudaFree(matstructT->alpha);CHKERRCUSP(err); }
	  if (matstructT->beta) { err=cudaFree(matstructT->beta);CHKERRCUSP(err); }
          if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
          if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
        }
        if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
        A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
      } else {
        /* never copied to the GPU: only the container and the handle exist */
        Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
        if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
        if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  } else {
    /* The triangular factors */
    try {
      Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
      Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
      Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;

      /* delete all the members in each of the data structures */
      if (loTriFactor) {
        if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
        if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactor->csrMat) {
	  if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
	  if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
	  if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
	  delete (CsrMatrix*)loTriFactor->csrMat;
	}
        if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
      }

      if (upTriFactor) {
        if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
        if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactor->csrMat) {
	  if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
	  if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
	  if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
	  delete (CsrMatrix*)upTriFactor->csrMat;
	}
        if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
      }

      if (loTriFactorT) {
        if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
        if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (loTriFactorT->csrMat) {
	  if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
	  if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
	  if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
	  delete (CsrMatrix*)loTriFactorT->csrMat;
	}
        if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
      }

      if (upTriFactorT) {
        if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
        if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
        if (upTriFactorT->csrMat) {
	  if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
	  if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
	  if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
	  delete (CsrMatrix*)upTriFactorT->csrMat;
	}
        if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
      }
      if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
      if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
      if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
      if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
      if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
  A->spptr = 0;

  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1821 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Constructor for the SEQAIJCUSPARSE matrix type.  Builds the underlying
   SeqAIJ matrix, attaches the GPU data structure to B->spptr (the mult
   structure for a regular matrix, the triangular-factor container for a
   factor matrix), creates the cusparse handle, and installs the CUSPARSE
   operations.

   Fix over the previous version: the handle/stream fields were first zeroed
   and then immediately overwritten (and stream was assigned twice); the
   repeated (Mat_SeqAIJCUSPARSE*)B->spptr casts are replaced by a typed local.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;
    cusparsestruct->mat          = 0;
    cusparsestruct->matTranspose = 0;
    cusparsestruct->workVector   = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparsestruct->handle       = handle;
    B->spptr                     = cusparsestruct;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = new Mat_SeqAIJCUSPARSETriFactors;
    cusparseTriFactors->loTriFactorPtr          = 0;
    cusparseTriFactors->upTriFactorPtr          = 0;
    cusparseTriFactors->loTriFactorPtrTranspose = 0;
    cusparseTriFactors->upTriFactorPtrTranspose = 0;
    cusparseTriFactors->rpermIndices            = 0;
    cusparseTriFactors->cpermIndices            = 0;
    cusparseTriFactors->workVector              = 0;
    cusparseTriFactors->nnz                     = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparseTriFactors->handle                  = handle;
    B->spptr                                    = cusparseTriFactors;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1882 
1883 /*M
1884    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1885 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1887    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1888    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1889 
1890    Options Database Keys:
1891 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1892 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1894 
1895   Level: beginner
1896 
1897 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1898 M*/
1899