xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ce8146529ec8918a9e13a256688e619a410abb90)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 
6 #include <petscconf.h>
7 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include <../src/vec/vec/impls/dvecimpl.h>
10 #include <petsc-private/vecimpl.h>
11 #undef VecType
12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
13 
/* Option strings for MatCUSPARSEStorageFormat, passed to PetscOptionsEnum() in
   MatSetFromOptions_SeqAIJCUSPARSE below; the trailing entries are the enum type
   name, the option prefix, and the NULL terminator the options system expects. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19 
20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23 
24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
34 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
35 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
36 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
37 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
38 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
39 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's
   CUSPARSE data and installs it on the matrix's cusparse handle, so later
   cusparse calls made through that handle are issued on this stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   err;

  PetscFunctionBegin;
  cusp->stream = stream;
  err = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSP(err);
  PetscFunctionReturn(0);
}
52 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - replaces the cusparse handle stored in the matrix,
   destroying any previously stored handle first, and sets the new handle's
   pointer mode to CUSPARSE_POINTER_MODE_DEVICE.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* Braces are required here: previously CHKERRCUSP(stat) sat outside the
       braceless if, so when no handle was set an uninitialized 'stat' was
       checked (undefined behavior / spurious error). */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
67 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - drops the matrix's reference to its cusparse
   handle without destroying the handle itself (the handle is presumably
   owned elsewhere -- compare MatCUSPARSESetHandle, which does destroy).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
78 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on factored matrices (see MatGetFactor_seqaij_cusparse):
   reports MATSOLVERCUSPARSE as the solver package that produced the factor. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
87 
88 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
95 
96   Level: beginner
97 
98 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
99 M*/
100 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Creates the factor matrix B for A and installs the CUSPARSE symbolic
   factorization routines for the requested factor type.  LU/ILU/ILUDT and
   Cholesky/ICC are supported; anything else raises PETSC_ERR_SUP.  The
   solver-package query callback is composed on B so callers can identify
   the factorization backend.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
127 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single stored format;
   other operations are rejected.  Before CUDA 4.2 only CSR exists, so
   ELL/HYB requests are an error there and the format field is untouched.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  /* the original switch had identical bodies for MULT and ALL; fold them */
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
152 
153 /*@
154    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
155    operation. Only the MatMult operation can use different GPU storage formats
156    for MPIAIJCUSPARSE matrices.
157    Not Collective
158 
159    Input Parameters:
160 +  A - Matrix of type SEQAIJCUSPARSE
161 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
162 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
163 
164    Output Parameter:
165 
166    Level: intermediate
167 
168 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
169 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
/* Public entry point documented in the man page above; validates A and
   dispatches to the type-specific implementation, if one was composed. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod is a no-op when the type does not provide MatCUSPARSESetFormat_C */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
181 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes runtime options for SEQAIJCUSPARSE matrices:
     -mat_cusparse_mult_storage_format  storage format for SpMV only
                                        (skipped on factored matrices)
     -mat_cusparse_storage_format       storage format for SpMV and TriSolve
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* bug fix: the return code of PetscObjectOptionsBegin was assigned but never
     checked, unlike every other call in this file */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
209 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE numeric routine on B so the numeric stage runs the GPU path. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
221 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE numeric routine on B so the numeric stage runs the GPU path. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
233 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: delegate to the CPU SeqAIJ implementation, then install the
   CUSPARSE Cholesky numeric routine on B for the GPU path. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
245 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: delegate to the CPU SeqAIJ implementation, then install
   the CUSPARSE Cholesky numeric routine on B for the GPU path. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
257 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Copies the unit lower triangular factor of the host-side (I)LU
   factorization to the GPU as a CSR matrix and runs the CUSPARSE
   triangular-solve analysis on it.

   Row 0 receives only an explicit 1.0 diagonal; for each later row the
   entries from the factored AIJ arrays are copied and a 1.0 diagonal entry
   is appended (the descriptor is nevertheless marked DIAG_TYPE_UNIT below).
   The CSR arrays are staged in pinned host memory (cudaMallocHost) and then
   assigned into thrust device containers.

   NOTE(review): nzLower = n+ai[n]-ai[1] reads ai[1], which assumes n >= 1 --
   confirm callers never pass an empty matrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is absent or stale relative to the CPU */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix; row 0 holds only the unit diagonal */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy row i's stored entries, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device CSR copy of the host-staged arrays */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* the host staging buffers are no longer needed once assigned to thrust */
      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
361 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Copies the upper triangular factor of the host-side (I)LU factorization
   to the GPU as a CSR matrix and runs the CUSPARSE triangular-solve
   analysis on it.

   a->diag locates the diagonal entry of each row of U; rows are walked in
   reverse (i = n-1..0) while the output offset is decremented, producing a
   forward-ordered CSR.  The diagonal value written is 1./v[nz]
   (NOTE(review): the CPU factor appears to store inverted pivots, which
   would make this the true pivot -- confirm against
   MatLUFactorNumeric_SeqAIJ before relying on that reading).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is absent or stale relative to the CPU */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix (pinned host memory) */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows backwards and
         decrementing the write offset so the result is forward-ordered */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device CSR copy of the host-staged arrays */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* the host staging buffers are no longer needed once assigned to thrust */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
461 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   Builds both (I)LU triangular factors on the GPU, allocates the work
   vector used by the triangular solves, and records the row/column
   permutations when they are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowis = aij->row,colis = aij->icol;
  PetscBool                    identity;
  const PetscInt               *idx;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* push the two triangular factors to the GPU and run the solve analyses */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector shared by the triangular solves */
  fs->workVector = new THRUSTARRAY;
  fs->workVector->resize(n);
  fs->nnz = aij->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;

  /* row permutation (used by the lower triangular solve); stored only when nontrivial */
  ierr = ISGetIndices(rowis,&idx);CHKERRQ(ierr);
  ierr = ISIdentity(rowis,&identity);CHKERRQ(ierr);
  if (!identity) {
    fs->rpermIndices = new THRUSTINTARRAY(n);
    fs->rpermIndices->assign(idx,idx+n);
  }
  ierr = ISRestoreIndices(rowis,&idx);CHKERRQ(ierr);

  /* column permutation (used by the upper triangular solve); stored only when nontrivial */
  ierr = ISGetIndices(colis,&idx);CHKERRQ(ierr);
  ierr = ISIdentity(colis,&identity);CHKERRQ(ierr);
  if (!identity) {
    fs->cpermIndices = new THRUSTINTARRAY(n);
    fs->cpermIndices->assign(idx,idx+n);
  }
  ierr = ISRestoreIndices(colis,&idx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
502 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both triangular factors of the ICC (incomplete Cholesky)
   factorization on the GPU from the host-side factor.

   A single upper-triangular CSR structure is staged once in pinned host
   buffers; two device copies are made with different values: the upper
   factor uses AAUp and is solved NON_TRANSPOSE, while the lower factor
   reuses the same sparsity with values AALo and is solved as a TRANSPOSE
   operation on upper-triangular storage (i.e. the transposed-upper solve
   plays the role of the lower solve).

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and to
   Mat_SeqSBAIJ (for the i/j/a arrays); this relies on the two structs
   exposing those fields compatibly -- confirm before modifying.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  /* only (re)build when the GPU copy is absent or stale relative to the CPU */
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host memory);
         AALo shares the same structure so it gets the same length */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals are negated; the lower-factor copy is additionally
             divided by v[nz].  NOTE(review): this mirrors the CPU ICC value
             convention -- confirm against MatCholeskyFactorNumeric_SeqAIJ. */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: note the fill mode is UPPER here too;
         the TRANSPOSE solveOp below makes this act as the lower solve */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same sparsity as the upper factor, different values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      /* the host staging buffers are no longer needed once assigned to thrust */
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
649 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   Builds the ICC triangular factors on the GPU, allocates the solve work
   vector, and records the (symmetric) permutation when it is not the
   identity -- the same index set serves as both row and column permutation.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm = aij->row;
  PetscBool                    identity;
  const PetscInt               *pidx;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* scratch vector shared by the triangular solves */
  fs->workVector = new THRUSTARRAY;
  fs->workVector->resize(n);
  /* off-diagonal entries are stored in both factors; the diagonal once */
  fs->nnz = (aij->nz-n)*2 + n;

  /* store the permutation (for both solves) only when it is nontrivial */
  ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    fs->rpermIndices = new THRUSTINTARRAY(n);
    fs->rpermIndices->assign(pidx,pidx+n);
    fs->cpermIndices = new THRUSTINTARRAY(n);
    fs->cpermIndices->assign(pidx,pidx+n);
  }
  ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
680 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: runs the host AIJ kernel, then installs the
   appropriate GPU solve variants and uploads the triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself is still performed on the host */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering admits cheaper solves that skip the permutation steps */
  ierr = ISIdentity(fact->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and perform the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
707 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: runs the host AIJ kernel, then installs
   the appropriate GPU solve variants and uploads the triangular factors. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself is still performed on the host */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering admits cheaper solves that skip the permutation steps */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and perform the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
734 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Builds explicit transposes (via CSR->CSC conversion) of the lower and upper
   triangular factors and runs the cuSPARSE solve analysis on them, storing the
   results in loTriFactorPtrTranspose/upTriFactorPtrTranspose.  Called lazily
   the first time a transpose solve is requested.

   Fix over the previous revision: loTriFactorT/upTriFactorT were initialized
   from the existing transpose pointers and then immediately overwritten with
   `new`, which was a dead store that read as if an existing transpose could be
   reused (and would hide a leak if one already existed).  They are now plain
   declarations; callers are expected to invoke this only when no transpose has
   been built yet. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* built below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; /* built below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill mode
     flips because the transpose of a lower triangular matrix is upper
     triangular (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the transpose is stored explicitly, so the solve itself is non-transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the transposed lower factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor (fill mode
     flips, as above) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the transpose is stored explicitly, so the solve itself is non-transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the transposed upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
867 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Builds an explicit GPU copy of A^T (CSR for MAT_CUSPARSE_CSR; ELL/HYB goes
   through HYB->CSR->CSC->HYB) and stores it in cusparsestruct->matTranspose
   for use by MatMultTranspose.

   Fix over the previous revision: matstructT was initialized from
   cusparsestruct->matTranspose and immediately overwritten with `new` -- a
   dead store that read as if an existing transpose could be reused.  It is
   now a plain declaration; callers are expected to invoke this only when no
   transpose has been generated yet. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* built below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate and describe the transpose mult structure; index base is copied
     from the forward matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* device-resident alpha=ALPHA and beta=BETA for the SpMV calls (the handle
     is placed in device pointer mode below) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* A^T has the same nonzero count; rows/cols taken from A's layout */
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* the CSC of A is the CSR of A^T */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete the CSR temporaries; only the HYB copy is kept */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the transpose */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
996 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Transpose triangular solve with reordering.
   The forward solve computes x = Pc (U^-1 (L^-1 (Pr b))); the transpose of
   that operator applies the pieces in reverse order:
     x = Pr' (L^-T (U^-T (Pc' b))).

   Fix over the previous revision: the permutations were applied in the same
   order as the forward MatSolve (row permutation first, column permutation
   last), which is wrong for the transposed system whenever the row and column
   orderings differ (LU with a non-identity ordering; ICC is unaffected since
   both permutations are identical there).  The column permutation is now
   applied first and the row permutation last. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the column permutation (the permutation applied last
     in the forward solve comes first in the transpose solve) */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->cpermIndices->end()),
               xGPU->begin());

  /* First, solve U^T */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, reorder with the row permutation into a temporary ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1060 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Transpose triangular solve for the natural (identity) ordering:
   x = L^-T (U^-T b).  No permutation steps are needed in this variant. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors lazily, on the first transpose solve */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve U^T y = b into the work vector */
  cstat = cusparse_solve(fs->handle, upT->solveOp,
                         upT->csrMat->num_rows, &ALPHA, upT->descr,
                         upT->csrMat->values->data().get(),
                         upT->csrMat->row_offsets->data().get(),
                         upT->csrMat->column_indices->data().get(),
                         upT->solveInfo,
                         barray->data().get(), work->data().get());CHKERRCUSP(cstat);

  /* solve L^T x = y */
  cstat = cusparse_solve(fs->handle, loT->solveOp,
                         loT->csrMat->num_rows, &ALPHA, loT->descr,
                         loT->csrMat->values->data().get(),
                         loT->csrMat->row_offsets->data().get(),
                         loT->csrMat->column_indices->data().get(),
                         loT->solveInfo,
                         work->data().get(), xarray->data().get());CHKERRCUSP(cstat);

  /* restore the vectors and wait for the asynchronous work to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1110 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Triangular solve with reordering: x = Pc (U^-1 (L^-1 (Pr b))).
   NOTE(review): this variant dereferences rpermIndices/cpermIndices without a
   NULL check; it appears to be installed only for non-identity orderings (see
   the factor-numeric routines above), where the analysis routines allocate
   both index arrays -- confirm for all call paths. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  /* scratch vector shared with the other solve variants */
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder with the row permutation: gather b through rpermIndices
     into x, which then serves as the right-hand side of the L solve */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve L: result goes into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve U: result goes back into x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore the vectors and wait for the asynchronous work to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1165 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Triangular solve for the natural (identity) ordering: x = U^-1 (L^-1 b).
   No permutation steps are needed in this variant. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)fs->workVector;
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve L y = b into the work vector */
  cstat = cusparse_solve(fs->handle, lo->solveOp,
                         lo->csrMat->num_rows, &ALPHA, lo->descr,
                         lo->csrMat->values->data().get(),
                         lo->csrMat->row_offsets->data().get(),
                         lo->csrMat->column_indices->data().get(),
                         lo->solveInfo,
                         barray->data().get(), work->data().get());CHKERRCUSP(cstat);

  /* solve U x = y */
  cstat = cusparse_solve(fs->handle, up->solveOp,
                         up->csrMat->num_rows, &ALPHA, up->descr,
                         up->csrMat->values->data().get(),
                         up->csrMat->row_offsets->data().get(),
                         up->csrMat->column_indices->data().get(),
                         up->solveInfo,
                         work->data().get(), xarray->data().get());CHKERRCUSP(cstat);

  /* restore the vectors and wait for the asynchronous work to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1207 
1208 #undef __FUNCT__
1209 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
1210 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1211 {
1212 
1213   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1214   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1215   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1216   PetscInt                     m = A->rmap->n,*ii,*ridx;
1217   PetscErrorCode               ierr;
1218   cusparseStatus_t             stat;
1219   cudaError_t                  err;
1220 
1221   PetscFunctionBegin;
1222   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1223     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1224     Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1225     try {
1226       cusparsestruct->nonzerorow=0;
1227       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1228 
1229       if (a->compressedrow.use) {
1230         m    = a->compressedrow.nrows;
1231         ii   = a->compressedrow.i;
1232         ridx = a->compressedrow.rindex;
1233       } else {
1234         /* Forcing compressed row on the GPU */
1235         int k=0;
1236         ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
1237         ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
1238         ii[0]=0;
1239         for (int j = 0; j<m; j++) {
1240           if ((a->i[j+1]-a->i[j])>0) {
1241             ii[k]  = a->i[j];
1242             ridx[k]= j;
1243             k++;
1244           }
1245         }
1246         ii[cusparsestruct->nonzerorow] = a->nz;
1247         m = cusparsestruct->nonzerorow;
1248       }
1249 
1250       /* allocate space for the triangular factor information */
1251       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1252       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1253       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1254       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
1255 
1256       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1257       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1258       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1259       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1260       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
1261 
1262       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1263       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1264         /* set the matrix */
1265 	CsrMatrix *matrix= new CsrMatrix;
1266 	matrix->num_rows = m;
1267 	matrix->num_cols = A->cmap->n;
1268 	matrix->num_entries = a->nz;
1269 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1270 	matrix->row_offsets->assign(ii, ii + m+1);
1271 
1272 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1273 	matrix->column_indices->assign(a->j, a->j+a->nz);
1274 
1275 	matrix->values = new THRUSTARRAY(a->nz);
1276 	matrix->values->assign(a->a, a->a+a->nz);
1277 
1278         /* assign the pointer */
1279         matstruct->mat = matrix;
1280 
1281       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1282 #if CUDA_VERSION>=4020
1283 	CsrMatrix *matrix= new CsrMatrix;
1284 	matrix->num_rows = m;
1285 	matrix->num_cols = A->cmap->n;
1286 	matrix->num_entries = a->nz;
1287 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1288 	matrix->row_offsets->assign(ii, ii + m+1);
1289 
1290 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1291 	matrix->column_indices->assign(a->j, a->j+a->nz);
1292 
1293 	matrix->values = new THRUSTARRAY(a->nz);
1294 	matrix->values->assign(a->a, a->a+a->nz);
1295 
1296         cusparseHybMat_t hybMat;
1297         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1298         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1299           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1300         stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1301                                 matstruct->descr, matrix->values->data().get(),
1302                                 matrix->row_offsets->data().get(),
1303                                 matrix->column_indices->data().get(),
1304                                 hybMat, 0, partition);CHKERRCUSP(stat);
1305         /* assign the pointer */
1306         matstruct->mat = hybMat;
1307 
1308 	if (matrix) {
1309 	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1310 	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1311 	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1312 	  delete (CsrMatrix*)matrix;
1313 	}
1314 #endif
1315       }
1316 
1317       /* assign the compressed row indices */
1318       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1319       matstruct->cprowIndices->assign(ridx,ridx+m);
1320 
1321       /* assign the pointer */
1322       cusparsestruct->mat = matstruct;
1323 
1324       if (!a->compressedrow.use) {
1325         ierr = PetscFree(ii);CHKERRQ(ierr);
1326         ierr = PetscFree(ridx);CHKERRQ(ierr);
1327       }
1328       cusparsestruct->workVector = new THRUSTARRAY;
1329       cusparsestruct->workVector->resize(m);
1330     } catch(char *ex) {
1331       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1332     }
1333     ierr = WaitForGPU();CHKERRCUSP(ierr);
1334 
1335     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
1336 
1337     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1338   }
1339   PetscFunctionReturn(0);
1340 }
1341 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   MatCreateVecs_SeqAIJCUSPARSE - creates CUSP vectors conforming to this matrix:
   a right vector laid out like the columns (suitable for y = A*x input) and a
   left vector laid out like the rows (suitable for the result). Either output
   pointer may be NULL if that vector is not wanted.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    /* right vector: length = number of columns, column block size */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    /* share the matrix column layout instead of building a fresh one */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* left vector: length = number of rows, row block size */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    /* share the matrix row layout instead of building a fresh one */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1367 
/* Functor applied through thrust::for_each over a zip iterator: given a tuple
   (source, destination), accumulates the source element into the destination
   (destination += source). Used to scatter compressed-row products back into a
   full-length vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1377 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   MatMult_SeqAIJCUSPARSE - computes yy = A*xx on the GPU with the CUSPARSE
   SpMV routines, dispatching on the chosen storage format (CSR, or ELL/HYB
   when built against CUDA >= 4.2).

   The matrix data is assumed to already reside on the GPU; the host-to-device
   copy is performed in MatAssemblyEnd_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#else
    /* previously this case fell through silently, leaving yy unwritten */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL and HYB storage formats require CUDA 4.2 or later");
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* 2 flops per stored nonzero, minus one per nonempty row for the first product term */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1418 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   MatMultTranspose_SeqAIJCUSPARSE - computes yy = A^T*xx on the GPU.

   An explicit transpose of the matrix is generated lazily on first use so
   that the SpMV can run in non-transpose mode; the transpose structure is
   cached in cusparsestruct->matTranspose.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#else
    /* previously this case fell through silently, leaving yy unwritten */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL and HYB storage formats require CUDA 4.2 or later");
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  /* 2 flops per stored nonzero, minus one per nonempty row for the first product term */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1465 
1466 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   MatMultAdd_SeqAIJCUSPARSE - computes zz = yy + A*xx on the GPU.

   zz is first set to a copy of yy; the product A*xx is formed into a work
   vector sized by the number of (compressed) rows and then scattered into zz
   with a += through the compressed-row index permutation (cprowIndices).
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hybMat,
                                 xarray->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#else
      /* previously this case fell through silently, leaving zz equal to yy */
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL and HYB storage formats require CUDA 4.2 or later");
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1526 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - computes zz = yy + A^T*xx on the GPU.

   Uses the lazily-built explicit transpose (see MatMultTranspose_SeqAIJCUSPARSE)
   so the SpMV runs in non-transpose mode. The product goes into a work vector
   sized by the number of (compressed) rows and is then scattered into zz with
   a += through the compressed-row index permutation (cprowIndices).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hybMat,
                                 xarray->data().get(), matstructT->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#else
      /* previously this case fell through silently, leaving zz equal to yy */
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL and HYB storage formats require CUDA 4.2 or later");
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1592 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - finishes CPU-side assembly, mirrors an
   unfactored matrix to the GPU, and (on final assembly only) installs the
   GPU-based multiply operations.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* unfactored matrices are copied up to the GPU as soon as assembly completes */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    /* final assembly: route multiplies through the CUSPARSE implementations */
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1611 
1612 /* --------------------------------------------------------------------------------*/
1613 #undef __FUNCT__
1614 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1615 /*@
1616    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1618    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1619    assembly performance the user should preallocate the matrix storage by setting
1620    the parameter nz (or the array nnz).  By setting these parameters accurately,
1621    performance during matrix assembly can be increased by more than a factor of 50.
1622 
1623    Collective on MPI_Comm
1624 
1625    Input Parameters:
1626 +  comm - MPI communicator, set to PETSC_COMM_SELF
1627 .  m - number of rows
1628 .  n - number of columns
1629 .  nz - number of nonzeros per row (same for all rows)
1630 -  nnz - array containing the number of nonzeros in the various rows
1631          (possibly different for each row) or NULL
1632 
1633    Output Parameter:
1634 .  A - the matrix
1635 
1636    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1638    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1639 
1640    Notes:
1641    If nnz is given then nz is ignored
1642 
1643    The AIJ format (also called the Yale sparse matrix format or
1644    compressed row storage), is fully compatible with standard Fortran 77
1645    storage.  That is, the stored row and column indices can begin at
1646    either one (as in Fortran) or zero.  See the users' manual for details.
1647 
1648    Specify the preallocated storage with either nz or nnz (not both).
1649    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1650    allocation.  For large problems you MUST preallocate memory or you
1651    will get TERRIBLE performance, see the users' manual chapter on matrices.
1652 
1653    By default, this format uses inodes (identical nodes) when possible, to
1654    improve numerical efficiency of matrix-vector products and solves. We
1655    search for consecutive rows with the same nonzero structure, thereby
1656    reusing matrix information to achieve increased efficiency.
1657 
1658    Level: intermediate
1659 
1660 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1661 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocation happens on the CPU side; data is pushed to the GPU at assembly time */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1673 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - frees the GPU-side data hanging off A->spptr
   (multiply structures for unfactored matrices, triangular-factor structures
   for factored ones), then destroys the underlying SeqAIJ matrix.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices carry triangular-solve data in spptr */
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
    /* unfactored matrices only hold GPU data once something was copied up */
    ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1691 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - type constructor for MATSEQAIJCUSPARSE.

   Builds the underlying SeqAIJ matrix, attaches the GPU data structure in
   B->spptr (a multiply structure for unfactored matrices, a triangular-factor
   structure otherwise), creates the CUSPARSE handle, and installs the
   CUSPARSE-backed operations.

   Note: the original code assigned handle and stream twice in the unfactored
   branch and handle twice in the factored branch; the dead stores are removed
   here and the repeated (Mat_SeqAIJCUSPARSE*) casts are replaced by typed locals.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    B->spptr           = cusp;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusp->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors; they are built later on demand */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* no GPU storage exists until the first assembly triggers the copy */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1752 
1753 /*M
1754    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1755 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1757    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1758    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1759 
1760    Options Database Keys:
1761 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1762 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1764 
1765   Level: beginner
1766 
1767 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1768 M*/
1769 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
/*
   Mat_SeqAIJCUSPARSE_Destroy - frees the multiply structures (matrix and its
   cached transpose), the work vector, and the CUSPARSE handle, then deletes
   the container itself and zeros the caller's pointer. Safe to call with a
   NULL container.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (!*cusparsestruct) PetscFunctionReturn(0);
  Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
  Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
  delete (*cusparsestruct)->workVector;
  handle = (*cusparsestruct)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSP(stat);
  }
  delete *cusparsestruct;
  *cusparsestruct = 0;
  PetscFunctionReturn(0);
}
1790 
#undef __FUNCT__
#define __FUNCT__ "CsrMatrix_Destroy"
/*
   CsrMatrix_Destroy - releases the three device arrays of a CsrMatrix
   (values, column indices, row offsets), deletes the struct, and zeros the
   caller's pointer. Safe to call with a NULL matrix.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = 0;
  PetscFunctionReturn(0);
}
1805 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
/*
   Mat_SeqAIJCUSPARSETriFactorStruct_Destroy - releases one triangular factor:
   its CUSPARSE matrix descriptor, its solve-analysis info, and its CSR data,
   then deletes the struct and zeros the caller's pointer. Safe to call with a
   NULL factor.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactor) PetscFunctionReturn(0);
  if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); }
  if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); }
  ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
  delete *trifactor;
  *trifactor = 0;
  PetscFunctionReturn(0);
}
1823 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/*
   Mat_SeqAIJCUSPARSEMultStruct_Destroy - releases a multiply structure: the
   stored matrix (hybrid matrix for ELL/HYB, CsrMatrix otherwise), the matrix
   descriptor, the compressed-row index array, and the device-resident
   alpha/beta scalars. Deletes the struct and zeros the caller's pointer.
   Safe to call with a NULL structure.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (!*matstruct) PetscFunctionReturn(0);
  if ((*matstruct)->mat) {
    if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
      cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
      stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
    } else {
      CsrMatrix *mat = (CsrMatrix*)(*matstruct)->mat;
      CsrMatrix_Destroy(&mat);
    }
  }
  if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); }
  delete (*matstruct)->cprowIndices;
  if ((*matstruct)->alpha) { err = cudaFree((*matstruct)->alpha);CHKERRCUSP(err); }
  if ((*matstruct)->beta)  { err = cudaFree((*matstruct)->beta);CHKERRCUSP(err); }
  delete *matstruct;
  *matstruct = 0;
  PetscFunctionReturn(0);
}
1852 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/*
   Mat_SeqAIJCUSPARSETriFactors_Destroy - releases all four triangular factors
   (lower/upper and their transposes), the row/column permutation index arrays,
   the work vector, and the CUSPARSE handle. Deletes the container and zeros
   the caller's pointer. Safe to call with a NULL container.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);
  Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);
  Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);
  Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
  Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
  delete (*trifactors)->rpermIndices;
  delete (*trifactors)->cpermIndices;
  delete (*trifactors)->workVector;
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSP(stat);
  }
  delete *trifactors;
  *trifactors = 0;
  PetscFunctionReturn(0);
}
1877 
1878