xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision bf0c4ff7f024e5604c03be42c12927a3985c36f7)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* String names for MatCUSPARSEStorageFormat; the trailing "MatCUSPARSEStorageFormat"
   and "MAT_CUSPARSE_" entries are the enum-type name and option prefix that
   PetscOptionsEnum expects, followed by the required null terminator. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix'
   CUSPARSE context and attaches it to the cusparse handle, so subsequent
   cusparse calls made for this matrix are queued on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cerr;

  PetscFunctionBegin;
  cusp->stream = stream;
  cerr = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSP(cerr);
  PetscFunctionReturn(0);
}
53 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - replaces the cusparse handle stored in the matrix'
   CUSPARSE context with the given one (destroying any previously held handle)
   and switches the handle to device pointer mode.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* bug fix: without braces, CHKERRCUSP(stat) executed unconditionally and
       read an uninitialized status value whenever no handle was destroyed */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
68 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - drops the matrix' reference to its cusparse handle
   without destroying it (the handle is assumed to be owned elsewhere).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
79 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback: reports that factored matrices of this type are backed by
   the MATSOLVERCUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
88 
89 /*MC
90   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
91   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
94   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
95   algorithms are not recommended. This class does NOT support direct solver operations.
96 
97   Level: beginner
98 
99 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
100 M*/
101 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   Factory routine registered for MATSOLVERCUSPARSE: creates an (unallocated)
   SEQAIJCUSPARSE matrix B that will receive the requested factorization of A,
   wiring up the symbolic-factorization function pointers for the factor type.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* MAT_SKIP_ALLOCATION: the symbolic factorization allocates the structure later */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
128 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat: stores the chosen
   GPU storage format in the matrix' CUSPARSE context. For sequential matrices
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL select the same (single) format slot.
   ELL/HYB formats need CUDA 4.2+, so older toolkits only reject those values.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops share the one format field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
153 
154 /*@
155    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
156    operation. Only the MatMult operation can use different GPU storage formats
157    for MPIAIJCUSPARSE matrices.
158    Not Collective
159 
160    Input Parameters:
161 +  A - Matrix of type SEQAIJCUSPARSE
162 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
163 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
164 
165    Output Parameter:
166 
167    Level: intermediate
168 
169 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
170 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
/* Public entry point: dispatches to the type-specific implementation via the
   composed "MatCUSPARSESetFormat_C" method; PetscTryMethod makes this a no-op
   for matrix types that do not compose that method. */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
182 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes runtime options controlling the GPU storage format:
     -mat_cusparse_mult_storage_format : format used for SpMV only (offered
                                         only for unfactored matrices)
     -mat_cusparse_storage_format      : format used for SpMV and TriSolve
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* bug fix: the return code of PetscObjectOptionsBegin was previously ignored */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  /* NOTE(review): both PetscOptionsHead and PetscObjectOptionsBegin are invoked
     but only a single PetscOptionsEnd closes them; verify this pairing against
     the PETSc options macros for this release. */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
211 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: delegates entirely to the CPU AIJ implementation, then
   redirects the numeric-factorization hook so the numeric stage builds the
   CUSPARSE (GPU) triangular factors. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
223 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: delegates to the CPU AIJ implementation, then redirects the
   numeric-factorization hook to the CUSPARSE variant. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
235 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: delegates to the CPU AIJ implementation, then redirects the
   numeric-factorization hook to the CUSPARSE variant. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
247 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: delegates to the CPU AIJ implementation, then redirects
   the numeric-factorization hook to the CUSPARSE variant. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
259 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Builds the unit-lower-triangular factor L of an ILU/LU factorization on the
   GPU: the strictly lower part of the host CSR factor stored in A is repacked
   row by row into a new CSR matrix with an explicit 1.0 appended on each
   diagonal, copied to the device via thrust, and handed to cusparse for solve
   analysis. Only runs when the GPU copy is absent or stale.
   NOTE(review): the host factor is assumed to store each row's strictly-lower
   entries first (indices ai[i]..), as produced by MatLUFactorNumeric_SeqAIJ —
   confirm against that routine.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers: staged copies to the device are faster */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 of L is just the unit diagonal, so the loop below starts at i=1 */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* assign() copies the pinned host buffers into device memory */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* NOTE(review): only exceptions thrown as char* are caught; a
         std::bad_alloc from new/thrust would propagate — confirm intended */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
363 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Builds the upper-triangular factor U of an ILU/LU factorization on the GPU.
   The host factor's upper part is addressed through a->diag (adiag), which in
   PETSc's factored AIJ points at each row's diagonal with rows stored
   back-to-front, so this routine fills the new CSR arrays from the last row
   backwards. Only runs when the GPU copy is absent or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers for fast host->device staging */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* NOTE(review): v[nz] addresses the stored diagonal; taking 1./v[nz]
           and using a NON_UNIT diag type assumes the factored AIJ keeps the
           reciprocal diagonal — confirm against MatLUFactorNumeric_SeqAIJ */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* assign() copies the pinned host buffers into device memory */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* NOTE(review): only char* exceptions are caught; std::bad_alloc would propagate */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
463 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   After a numeric ILU/LU factorization on the CPU: push both triangular
   factors to the GPU (building them and running the cusparse solve analysis),
   allocate the device scratch vector used by the two-stage MatSolve, and cache
   the row/column permutations on the device — but only when the orderings are
   not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector holding the intermediate result between the L and U solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* only store a device-side permutation when rows are actually reordered */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
504 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both GPU triangular factors for an ICC/Cholesky factorization from a
   single upper-triangular host factor. The same upper CSR sparsity pattern is
   loaded twice with different value scalings: once solved NON_TRANSPOSE as the
   "upper" factor and once solved TRANSPOSE (with UPPER fill mode) so that the
   transpose acts as the lower factor. Only runs when the GPU copy is absent or
   stale.
   NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ
   (for i/j/a), i.e. the factor is assumed to be laid out SBAIJ-style with the
   diagonal entry last in each row — confirm against the ICC numeric routine.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers; AAUp/AALo share the sparsity of the upper factor */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* NOTE(review): the value scaling (1/diag on the diagonal, negated and
           diag-scaled off-diagonals below) is assumed to match what the
           CUSPARSE unit-diag upper solve and transposed lower solve require —
           confirm against the natural-ordering ICC MatSolve */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor is realized as the TRANSPOSE of an upper-triangular
         CSR, hence the UPPER fill mode combined with a TRANSPOSE solve op */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      /* NOTE(review): only char* exceptions are caught; std::bad_alloc would propagate */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
651 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   After a numeric ICC/Cholesky factorization on the CPU: push both triangular
   factors to the GPU, allocate the device scratch vector used during MatSolve,
   and (for non-natural orderings) cache the permutation on the device. ICC
   uses one symmetric permutation, so the same index set fills both the row and
   column permutation arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* both factors share the upper pattern: count off-diagonals twice plus the diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
682 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: the factorization itself runs on the host via
   the plain SeqAIJ kernel; afterwards the GPU solve routines are selected
   (the natural-ordering variants avoid permutation work) and the factors
   are analyzed and copied to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numerical factorization */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* both orderings must be the identity for the permutation-free solves */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);

  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering          : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;

  /* run the cusparse analysis and move the triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
709 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: runs the host SeqAIJ factorization, then
   installs the appropriate GPU solve kernels (permutation-free variants when
   the ordering is the identity) and copies the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numerical factorization */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* a non-identity ordering forces the permuting solve kernels */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* run the cusparse analysis and move the triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
736 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
   Builds explicit transposes (via csr2csc) of the cached lower and upper
   triangular factors so that MatSolveTranspose can run NON_TRANSPOSE
   cusparse solves on them, and performs the cusparse solve analysis on each.
   The results are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose
   on A->spptr.

   NOTE(review): there is no guard against being called twice; callers are
   expected to check the transpose pointers first (the MatSolveTranspose
   routines do) or the previously built structures would leak.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* built below; the cached pointer is overwritten at the end */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the lower triangular factor, flipping the
     fill mode: the transpose of a lower triangular matrix is upper triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* non-transpose solve: the transposition is done explicitly by csr2csc below */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* cache the transposed lower factor on the matrix */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the upper triangular factor, again
     flipping the fill mode for the transpose */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* non-transpose solve on the explicitly transposed factor */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* cache the transposed upper factor on the matrix */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
869 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Builds an explicit transpose of the GPU matrix for use by
   MatMultTranspose.  For CSR storage the transpose is obtained directly
   with csr2csc; for ELL/HYB storage (CUDA >= 5.0) the matrix is converted
   HYB -> CSR -> CSC -> HYB through temporaries.  The result is cached in
   cusparsestruct->matTranspose.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* built below; cached on A->spptr at the end */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transposed multiply information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* set alpha and beta as device-resident scalars (pointer mode DEVICE below) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC
       (indexBase was already read from matstruct->descr above) */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* cache the transpose on the matrix */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
998 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solves A^T x = b on the GPU for factored matrices with a non-identity
   ordering.  Uses the explicitly transposed factors (built lazily on first
   call), solving with U^T first and then L^T, with permutation steps before
   and after.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors lazily on first use, then reload the cached pointers */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* Reorder b with the row permutation into x, which serves as scratch here.
     NOTE(review): this mirrors the non-transpose solve (rperm first, cperm
     last); for a transpose solve with distinct row/column orderings one
     would expect the permutation roles to swap -- confirm.  The two are
     identical for the symmetric (Cholesky/ICC) case. */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Solve with U^T (stored explicitly transposed, hence a NON_TRANSPOSE solve): x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then solve with L^T: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Apply the column permutation into the temporary (cannot be done in place) */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore the vector arrays and wait for the device to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1062 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/*
   Solves A^T x = b on the GPU for factored matrices in the natural
   (identity) ordering: no permutations are applied, so b feeds straight
   into the U^T solve.  The explicitly transposed factors are built lazily
   on first call.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors lazily on first use, then reload the cached pointers */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* Solve with U^T (stored explicitly transposed, hence a NON_TRANSPOSE solve): b -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then solve with L^T: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* restore the vector arrays and wait for the device to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1112 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solves A x = b on the GPU for factored matrices with a non-identity
   ordering: reorders b with the row permutation, runs the L then U
   triangular solves, and applies the column permutation to the result.

   Both bb and xx must be VECSEQCUSP vectors; anything else is rejected.

   Fix: the error messages previously embedded a newline before the final
   period ("... %s\n."); the stray newline has been removed.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  VecType                           t;
  PetscBool                         flg;

  PetscFunctionBegin;
  /* both vectors must live on the GPU (VECSEQCUSP) for the device pointers below to be valid */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s.",t,VECSEQCUSP);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s.",t,VECSEQCUSP);

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* Reorder b with the row permutation into x, which serves as scratch here */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Forward solve: L*temp = x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Back solve: U*x = temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Apply the column permutation into the temporary (cannot be done in place) */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore the vector arrays and wait for the device to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1176 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A x = b on the GPU for factored matrices in the natural (identity)
   ordering: no permutations are needed, so b feeds straight into the
   forward (L) solve and the back (U) solve writes directly into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *workVector = (THRUSTARRAY*)cusparseTriFactors->workVector;
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* access the device arrays of both vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward solve: L*work = b */
  cstat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                         loTriFactor->csrMat->values->data().get(),
                         loTriFactor->csrMat->row_offsets->data().get(),
                         loTriFactor->csrMat->column_indices->data().get(),
                         loTriFactor->solveInfo,
                         barray->data().get(), workVector->data().get());CHKERRCUSP(cstat);

  /* back solve: U*x = work */
  cstat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                         upTriFactor->csrMat->values->data().get(),
                         upTriFactor->csrMat->row_offsets->data().get(),
                         upTriFactor->csrMat->column_indices->data().get(),
                         upTriFactor->solveInfo,
                         workVector->data().get(), xarray->data().get());CHKERRCUSP(cstat);

  /* restore the vector arrays and wait for the device to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1218 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the host (Mat_SeqAIJ) matrix data to the GPU, building the cusparse
   data structures (CSR, or ELL/HYB when that storage format was selected)
   used by the multiply routines.

   A compressed-row layout is always used on the GPU: rows with no nonzeros
   are squeezed out, and the original row numbers of the remaining rows are
   stored in matstruct->cprowIndices so results can be scattered back.

   Collective state: runs only when valid_GPU_matrix is UNALLOCATED or CPU;
   on success the matrix is marked PETSC_CUSP_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* free any stale GPU copy before rebuilding it */
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
    try {
      /* count the rows that actually contain nonzeros */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU */
        int k=0;
        ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the matrix information */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* device-resident alpha=ALPHA and beta=BETA so spmv can run with
         CUSPARSE_POINTER_MODE_DEVICE */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* the CSR data is only a staging area here; cusparse converts it to
           its opaque hybrid format and the staging arrays are freed below */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* release any previous work vector before allocating a new one; the
         old code leaked it on every repeat host->device copy */
      delete cusparsestruct->workVector;
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1352 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates CUSP (GPU) vectors that conform to the layout of mat: *right is
   compatible with the column map (an input for MatMult), *left with the row
   map (an output).  Either output pointer may be NULL to skip that vector.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowBlockSize,colBlockSize;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowBlockSize,&colBlockSize);CHKERRQ(ierr);
  if (right) {
    /* the right vector matches the column layout of the matrix */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colBlockSize);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* the left vector matches the row layout of the matrix */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowBlockSize);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1378 
/* Thrust functor used with zip iterators: accumulates the first tuple
   component into the second, i.e. dest += src for each zipped pair. */
struct VecCUSPPlusEquals
{
  template <typename TuplePair>
  __host__ __device__
  void operator()(TuplePair p)
  {
    thrust::get<1>(p) += thrust::get<0>(p);
  }
};
1388 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* yy = A*xx on the GPU, dispatching to the CSR or the ELL/HYB cusparse
   spmv kernel depending on the storage format chosen for this matrix.
   The matrix data was copied to the GPU in MatAssemblyEnd_SeqAIJCUSPARSE. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csr = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols, csr->num_entries,
                             matstruct->alpha, matstruct->descr,
                             csr->values->data().get(),
                             csr->row_offsets->data().get(),
                             csr->column_indices->data().get(),
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hyb = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hyb,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only block on the GPU when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1429 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* yy = A^T*xx on the GPU.  An explicit transpose copy of the matrix is built
   lazily on first use (MatSeqAIJCUSPARSEGenerateTransposeForMult) so the
   spmv can run with CUSPARSE_OPERATION_NON_TRANSPOSE on that copy. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* build the transpose data structure on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *csrT = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csrT->num_rows, csrT->num_cols, csrT->num_entries,
                             matstructT->alpha, matstructT->descr,
                             csrT->values->data().get(),
                             csrT->row_offsets->data().get(),
                             csrT->column_indices->data().get(),
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybT = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybT,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only block on the GPU when no user-supplied stream is in use */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1476 
1477 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A*xx on the GPU.  The product A*xx is computed into the
   compressed work vector (one entry per nonzero row) and then scattered
   into zz with a += through the compressed-row index list cprowIndices.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* start zz as a copy of yy; the product is accumulated into it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* here we need to be careful to set the number of rows in the multiply to the
       number of compressed rows in the matrix ... which is equivalent to the
       size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the spmv entirely when the matrix has no nonzero rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray->data().get(), matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* zip (workVector[i], zarray[cprowIndices[i]]) pairs so the functor adds
       each compressed result into its original row position in zz */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1537 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A^T*xx on the GPU.  Uses the lazily-built explicit transpose
   (matTranspose); the product is computed into the compressed work vector
   and then scattered into zz with a += through cprowIndices.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the transpose data structure on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start zz as a copy of yy; the product is accumulated into it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the spmv entirely when the matrix has no nonzero rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray->data().get(), matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* zip (workVector[i], zarray[cprowIndices[i]]) pairs so the functor adds
       each compressed result into its original row position in zz */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1603 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Completes assembly for the CPU (SeqAIJ) representation, pushes the data
   to the GPU for non-factored matrices, and on a final assembly installs
   the GPU multiply kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* flush assemblies keep the existing operations in place */
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1622 
/* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* calls the SeqAIJ preallocation routine directly since the type is known */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1684 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Frees the GPU-side data (mult structures for a plain matrix, triangular
   factor structures for a factored one) and then the host SeqAIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
    /* only tear down the GPU copy if one was ever built */
    ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1702 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor: builds on SeqAIJ, then attaches either the GPU multiply
   data (MAT_FACTOR_NONE) or the triangular-factor containers (factored
   matrices), creates a cusparse handle, and installs the CUSPARSE operations.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;
    cusparsestruct->mat          = 0;
    cusparsestruct->matTranspose = 0;
    cusparsestruct->workVector   = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparsestruct->handle       = handle;
    B->spptr = cusparsestruct;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = new Mat_SeqAIJCUSPARSETriFactors;
    cusparseTriFactors->loTriFactorPtr          = 0;
    cusparseTriFactors->upTriFactorPtr          = 0;
    cusparseTriFactors->loTriFactorPtrTranspose = 0;
    cusparseTriFactors->upTriFactorPtrTranspose = 0;
    cusparseTriFactors->rpermIndices            = 0;
    cusparseTriFactors->cpermIndices            = 0;
    cusparseTriFactors->workVector              = 0;
    cusparseTriFactors->nnz                     = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparseTriFactors->handle                  = handle;
    B->spptr = cusparseTriFactors;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* no GPU storage has been built yet; MatAssemblyEnd triggers the copy */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1758 
1759 /*M
1760    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1761 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1763    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1764    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1765 
1766    Options Database Keys:
1767 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1768 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1770 
1771   Level: beginner
1772 
1773 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1774 M*/
1775 
1776 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1777 
1778 
#undef __FUNCT__
#define __FUNCT__ "MatSolverPackageRegister_CUSPARSE"
/* Registers the CUSPARSE factorization backend for all factor types it
   supports (LU, Cholesky, ILU, ICC) on seqaijcusparse matrices. */
PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  int                 i;

  PetscFunctionBegin;
  for (i=0; i<4; i++) {
    ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1792 
1793 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
/*
   Frees everything hanging off a Mat_SeqAIJCUSPARSE: the mult structures for
   the matrix and its cached transpose, the compressed-row work vector, and
   the cusparse handle, then the struct itself.  Safe to call when the
   pointer is already NULL.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
    delete (*cusparsestruct)->workVector;   /* delete of NULL is a no-op */
    /* separate the assignment from the test (the old `if (handle = ...)`
       idiom invites = vs == mistakes and compiler warnings) */
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSP(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1814 
#undef __FUNCT__
#define __FUNCT__ "CsrMatrix_Destroy"
/* Frees the three thrust arrays of a CsrMatrix and the struct itself,
   then zeros the caller's pointer.  NULL input is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1829 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
/* Releases the cusparse descriptor, the solve-analysis info, and the CSR
   storage of one triangular factor, then frees the struct itself and
   zeros the caller's pointer.  NULL input is a no-op. */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactor) PetscFunctionReturn(0);
  if ((*trifactor)->descr)     { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat); }
  if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat); }
  ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
  delete *trifactor;
  *trifactor = 0;
  PetscFunctionReturn(0);
}
1847 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/*
   Frees one GPU multiply structure: the matrix storage (hybrid handle for
   ELL/HYB, CsrMatrix for CSR), the cusparse descriptor, the compressed-row
   index list, and the device-resident alpha/beta scalars, then the struct
   itself.  Safe to call when the pointer is already NULL.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code; it was silently dropped before */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUSP(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUSP(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1876 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/*
   Frees all four triangular factor structures (lower/upper and their
   transposes), the row/column permutation index lists, the work vector,
   and the cusparse handle, then the container itself.  Safe to call when
   the pointer is already NULL.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* check the return codes; they were silently dropped before */
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* separate the assignment from the test (avoids the `if (handle = ...)` idiom) */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSP(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1901 
1902