xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 966be33a19c9230d4aa438248a644248d45cc287)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/* Record the CUDA stream in the matrix GPU data structure and attach it to the
   cuSPARSE handle so that subsequent cuSPARSE work for this matrix is queued on
   that stream. Assumes A->spptr is a valid Mat_SeqAIJCUSPARSE with a live handle. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
53 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/* Install a caller-provided cuSPARSE handle on the matrix, destroying any handle
   the matrix currently owns, and switch the handle to device pointer mode.

   Fix: the original wrote `if (cond) stat = cusparseDestroy(...);CHKERRCUDA(stat);`
   without braces, so the error check executed unconditionally and read an
   uninitialized `stat` whenever no previous handle existed.  The check now runs
   only after an actual cusparseDestroy() call. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
68 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/* Forget the cuSPARSE handle stored with the matrix.  The handle itself is NOT
   destroyed here; the matrix merely drops its pointer to it. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
79 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on factor matrices created by
   MatGetFactor_seqaijcusparse_cusparse(): reports MATSOLVERCUSPARSE as the
   solver package that produced A. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
88 
89 /*MC
90   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
91   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
94   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
95   algorithms are not recommended. This class does NOT support direct solver operations.
96 
97   Level: beginner
98 
99 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
100 M*/
101 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/* Factory routine: creates an empty SEQAIJCUSPARSE matrix B that will later hold
   a factorization of A, and installs the symbolic-factorization callbacks that
   match the requested factor type.  Unsupported factor types raise an error. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    /* LU-family factorizations also inherit the block sizes of A */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
128 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/* Type-specific implementation behind MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
   A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL set the same field; other operations are rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* before CUDA 4.2 only CSR is available; non-CSR requests are errors */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
152 
153 /*@
154    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
155    operation. Only the MatMult operation can use different GPU storage formats
156    for MPIAIJCUSPARSE matrices.
157    Not Collective
158 
159    Input Parameters:
160 +  A - Matrix of type SEQAIJCUSPARSE
161 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
162 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
163 
164    Output Parameter:
165 
166    Level: intermediate
167 
168 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
169 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the implementation composed on A (e.g.
     MatCUSPARSESetFormat_SeqAIJCUSPARSE); PetscTryMethod is a no-op for
     matrix types that do not provide one */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
181 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/* Process runtime options for SEQAIJCUSPARSE matrices: the GPU storage format
   used for SpMV only (-mat_cusparse_mult_storage_format, unfactored matrices
   only) and for both SpMV and triangular solves (-mat_cusparse_storage_format). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* NOTE(review): the result of PetscObjectOptionsBegin is assigned but not
     checked with CHKERRQ, and there is no matching PetscOptionsTail for the
     PetscOptionsHead above -- confirm this pairing is intentional */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format is meaningful for unfactored matrices only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* this option covers SpMV and triangular solves alike (MAT_CUSPARSE_ALL) */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
210 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ILU: reuse the CPU SeqAIJ symbolic phase, then redirect the numeric
   phase to the CUSPARSE variant so the factors are analyzed/copied to the GPU. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic LU: reuse the CPU SeqAIJ symbolic phase, then redirect the numeric
   phase to the CUSPARSE variant so the factors are analyzed/copied to the GPU. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic ICC: reuse the CPU SeqAIJ symbolic phase, then redirect the numeric
   phase to the CUSPARSE Cholesky-numeric variant. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
246 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/* Symbolic Cholesky: reuse the CPU SeqAIJ symbolic phase, then redirect the
   numeric phase to the CUSPARSE Cholesky-numeric variant. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
258 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Builds the lower-triangular factor L on the GPU from the combined L/U
   storage held in the SeqAIJ structure of the factored matrix A.  L is
   assembled on the host with an explicit unit diagonal, copied to the GPU
   as a 32-bit-index CSR matrix, and handed to cuSPARSE for triangular-solve
   analysis.  Skipped entirely when the GPU copy is already up to date.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries plus the strictly-lower entries of rows 1..n-1
         (row 0 has no strictly-lower entries, hence ai[1] rather than ai[0]) */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* page-locked (pinned) host buffers; freed at the bottom of this block */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 consists only of the implicit unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append a unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* assign() copies the host arrays into the device-side thrust vectors */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* NOTE(review): these pinned buffers leak if an exception is thrown above -- confirm acceptable */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust failures typically throw std::exception subclasses,
         which a char* handler will not catch -- confirm this is the intended set */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
362 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Builds the upper-triangular factor U on the GPU from the combined L/U
   storage of the factored SeqAIJ matrix A.  U is assembled on the host by
   walking rows backwards through a->diag, copied to the GPU as a
   32-bit-index CSR matrix, and analyzed by cuSPARSE for triangular solves.
   Skipped entirely when the GPU copy is already up to date.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs from adiag[0] down to adiag[n] in this storage (see the
         backward walk below), so the difference counts all U entries */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* page-locked (pinned) host buffers; freed at the bottom of this block */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        /* off-diagonal entries of row i start just past adiag[i+1] */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        /* v[nz] is the stored diagonal; presumably the SeqAIJ factorization
           keeps it inverted, so the reciprocal recovers the pivot -- TODO confirm */
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* assign() copies the host arrays into the device-side thrust vectors */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* NOTE(review): these pinned buffers leak if an exception is thrown above -- confirm acceptable */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust failures typically throw std::exception subclasses,
         which a char* handler will not catch -- confirm this is the intended set */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
462 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/* After an (I)LU numeric factorization on the CPU, push both triangular
   factors to the GPU, allocate the intermediate work vector used by the
   triangular solves, and cache the row/column permutation indices on the
   device (only when the orderings are not the identity). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  const PetscInt               *rowIndices,*colIndices;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     n = A->rmap->n;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* build and analyze L and U on the GPU */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector shared between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz = a->nz;

  A->valid_GPU_matrix = PETSC_CUDA_BOTH;

  /* row permutation indices, cached on the device when non-identity */
  ierr = ISGetIndices(isrow,&rowIndices);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rowIndices,rowIndices+n);
  }
  ierr = ISRestoreIndices(isrow,&rowIndices);CHKERRQ(ierr);

  /* column permutation indices, cached on the device when non-identity */
  ierr = ISGetIndices(iscol,&colIndices);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(colIndices,colIndices+n);
  }
  ierr = ISRestoreIndices(iscol,&colIndices);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
503 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both triangular factors of an ICC/Cholesky factorization on the GPU.
   A single upper-triangular CSR pattern is assembled on the host: the upper
   factor uses its values directly (unit diagonal), while the "lower" factor
   reuses the same sparsity pattern with rescaled values and is solved in
   transpose mode (CUSPARSE_OPERATION_TRANSPOSE) below.  Skipped entirely
   when the GPU copy is already up to date.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast to both Mat_SeqAIJ and Mat_SeqSBAIJ here;
     this relies on the ICC factor layout being compatible with both -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* page-locked (pinned) host buffers; freed at the bottom of this block */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored diagonal entry of row i; both factors take its
           reciprocal here -- presumably it holds the squared pivot of the
           symmetric factorization -- TODO confirm */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* copy the off-diagonal entries; the lower factor gets them scaled
             by an extra 1/v[nz] */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      /* assign() copies the host arrays into the device-side thrust vectors */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor keeps the same UPPER-stored pattern; the lower
         solve is realized via the TRANSPOSE operation set below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      /* NOTE(review): these pinned buffers leak if an exception is thrown above -- confirm acceptable */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust failures typically throw std::exception subclasses,
         which a char* handler will not catch -- confirm this is the intended set */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
650 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/* After an ICC/Cholesky numeric factorization on the CPU, upload the
   triangular data to the GPU, size the solve work vector, and cache the
   permutation indices on the device.  For this symmetric case a single
   ordering serves as both the row and the column permutation. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *perm;
  PetscBool                    permIsIdentity;
  PetscInt                     n = A->rmap->n;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* scratch vector shared between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* strictly-triangular entries appear in both factors, the diagonal once */
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  /* cache the permutation on the device only when it is not the identity */
  ierr = ISGetIndices(ip,&perm);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(perm,perm+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(perm,perm+n);
  }
  ierr = ISRestoreIndices(ip,&perm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
681 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: runs the CPU factorization, picks the MatSolve
   kernels that match the ordering, and copies the factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b     = (Mat_SeqAIJ*)B->data;
  IS             isrow  = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the factorization itself happens on the host */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the natural-ordering solvers avoid the permutation steps entirely */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);

  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;

  /* move the triangular factors to the GPU and perform the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
708 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: runs the CPU factorization, picks the
   MatSolve kernels that match the ordering, and copies the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b  = (Mat_SeqAIJ*)B->data;
  IS             ip  = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the factorization itself happens on the host */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the natural-ordering solvers avoid the permutation steps entirely */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);

  B->ops->solve          = perm_identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = perm_identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;

  /* move the triangular factors to the GPU and perform the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
735 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/*
  Builds the transposed triangular factors on the GPU (via csr2csc) and runs
  the cuSPARSE solve analysis on them, so that MatSolveTranspose can use
  non-transpose solves on the transposed data.

  Fixed: loTriFactorT/upTriFactorT were initialized from
  cusparseTriFactors->{lo,up}TriFactorPtrTranspose and then immediately
  overwritten with `new`; the dead initializers (which suggested reuse of an
  existing transpose) have been removed.

  NOTE(review): if called when the transpose factors already exist, the old
  ones are overwritten without being freed; the callers visible in this file
  guard against that by checking the pointers first.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* allocated below */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;  /* allocated below */
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the lower triangular factor; transposing
     swaps the fill mode (lower becomes upper and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the data is transposed explicitly, so the solve op is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the transposed lower factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the upper triangular factor; transposing
     swaps the fill mode (upper becomes lower and vice versa) */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* the data is transposed explicitly, so the solve op is non-transpose */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the transposed upper factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
868 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
  Builds the GPU representation of the matrix transpose used by
  MatMultTranspose. For CSR storage the transpose is obtained directly with
  csr2csc; for ELL/HYB the matrix is converted HYB->CSR, transposed with
  csr2csc, and converted back to HYB (requires CUDA 5.0+).

  Fixed: matstructT was initialized from cusparsestruct->matTranspose and then
  immediately overwritten with `new` (dead, misleading initializer); a
  redundant second cusparseGetMatIndexBase() call was removed; copy-pasted
  comments referring to "triangular factors" were corrected -- this routine
  handles the full matrix, not a factor.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* allocated below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate the mult structure for the transpose; its descriptor inherits the
     index base of the original matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta on the device (cusparse is put in device pointer mode) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    /* NOTE(review): this error path leaks matstructT and its device alpha/beta
       allocations; acceptable since the solver cannot proceed anyway */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty: transpose mult does not compress) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the transpose structure */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
997 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Transposed triangular solve on the GPU with row/column permutations:
   permute b, solve with the transposed upper factor, then the transposed
   lower factor, and permute the result. The transposed factors are built
   lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                          n = xx->map->n;
  PetscScalar                       *xv, *bv;
  thrust::device_ptr<PetscScalar>   xThrust,bThrust;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors on the fly the first time they are needed */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* map the vectors onto the GPU */
  ierr = VecCUDAGetArrayWrite(xx,&xv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bv);CHKERRQ(ierr);
  xThrust = thrust::device_pointer_cast(xv);
  bThrust = thrust::device_pointer_cast(bv);

  /* apply the row permutation: x <- b permuted by rpermIndices */
  thrust::copy(thrust::make_permutation_iterator(bThrust, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bThrust+n, cusparseTriFactors->rpermIndices->end()),
               xThrust);

  /* solve with the transposed upper factor first ... */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xv, work->data().get());CHKERRCUDA(stat);

  /* ... then with the transposed lower factor */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        work->data().get(), xv);CHKERRCUDA(stat);

  /* apply the column permutation into the work vector (cannot be done in place) */
  thrust::copy(thrust::make_permutation_iterator(xThrust, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xThrust+n, cusparseTriFactors->cpermIndices->end()),
               work->begin());

  /* copy the permuted solution back into x */
  thrust::copy(work->begin(), work->end(), xThrust);

  /* release the GPU arrays and wait for completion */
  ierr = VecCUDARestoreArrayRead(bb,&bv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xv);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1065 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Transposed triangular solve on the GPU for the natural ordering: no
   permutations, just the transposed-upper then transposed-lower solves.
   The transposed factors are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  PetscScalar                       *xv, *bv;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors on the fly the first time they are needed */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* map the vectors onto the GPU */
  ierr = VecCUDAGetArrayWrite(xx,&xv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bv);CHKERRQ(ierr);

  /* solve with the transposed upper factor first ... */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        bv, work->data().get());CHKERRCUDA(stat);

  /* ... then with the transposed lower factor */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        work->data().get(), xv);CHKERRCUDA(stat);

  /* release the GPU arrays and wait for completion */
  ierr = VecCUDARestoreArrayRead(bb,&bv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xv);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1115 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Triangular solve on the GPU with row/column permutations: permute b,
   solve with L, then with U, then permute the result. Both vectors must be
   of type VECSEQCUDA. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscScalar                       *xv, *bv;
  thrust::device_ptr<PetscScalar>   xThrust,bThrust;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  VecType                           t;
  PetscBool                         flg;

  PetscFunctionBegin;
  /* both vectors must live on the GPU */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* map the vectors onto the GPU */
  ierr = VecCUDAGetArrayWrite(xx,&xv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bv);CHKERRQ(ierr);
  xThrust = thrust::device_pointer_cast(xv);
  bThrust = thrust::device_pointer_cast(bv);

  /* apply the row permutation: x <- b permuted by rpermIndices */
  thrust::copy(thrust::make_permutation_iterator(bThrust, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bThrust, cusparseTriFactors->rpermIndices->end()),
               xThrust);

  /* forward solve with the lower factor ... */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xv, work->data().get());CHKERRCUDA(stat);

  /* ... then back solve with the upper factor */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        work->data().get(), xv);CHKERRCUDA(stat);

  /* apply the column permutation into the work vector (cannot be done in place) */
  thrust::copy(thrust::make_permutation_iterator(xThrust, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xThrust, cusparseTriFactors->cpermIndices->end()),
               work->begin());

  /* copy the permuted solution back into x */
  thrust::copy(work->begin(), work->end(), xThrust);

  /* release the GPU arrays and wait for completion */
  ierr = VecCUDARestoreArrayRead(bb,&bv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xv);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1182 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Triangular solve on the GPU for the natural ordering: no permutations,
   just the forward (L) and backward (U) substitutions. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  PetscScalar                       *xv, *bv;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* map the vectors onto the GPU */
  ierr = VecCUDAGetArrayWrite(xx,&xv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bv);CHKERRQ(ierr);

  /* forward solve with the lower factor ... */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        bv, work->data().get());CHKERRCUDA(stat);

  /* ... then back solve with the upper factor */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        work->data().get(), xv);CHKERRCUDA(stat);

  /* release the GPU arrays and wait for completion */
  ierr = VecCUDARestoreArrayRead(bb,&bv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xv);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1224 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   MatSeqAIJCUSPARSECopyToGPU - Builds (or rebuilds) the GPU copy of the
   host AIJ matrix in cusparsestruct->mat, in the storage format selected by
   cusparsestruct->format (CSR, ELL, or HYB).

   Only acts when the GPU copy is missing or stale (valid_GPU_matrix is
   PETSC_CUDA_UNALLOCATED or PETSC_CUDA_CPU); on success the matrix is
   flagged PETSC_CUDA_BOTH.  The GPU copy is always kept in compressed-row
   form: rows with no nonzeros are squeezed out, and the original indices of
   the retained rows are recorded in matstruct->cprowIndices so results can
   be scattered back (see MatMultAdd_SeqAIJCUSPARSE).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* throw away any previous GPU copy before rebuilding it */
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
    try {
      /* count the rows that actually contain nonzeros */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* host matrix is already stored compressed: reuse its row pointers and row index map */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU */
        int k=0;
        ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
        ii[0]=0;
        /* keep only the nonempty rows; ridx maps compressed row -> original row */
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate the structure holding the matrix-multiply data and descriptor */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

      /* place the scalar constants ALPHA and BETA on the device, since the
         cusparse handle is put in CUSPARSE_POINTER_MODE_DEVICE below */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix: upload row offsets, column indices and values */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* build a temporary device CSR matrix, convert it to ELL/HYB, then free it */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
        /* MAX partition yields pure ELL; AUTO lets cusparse pick the ELL/COO split */
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUDA(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the temporary CSR copy is no longer needed once converted */
        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
        /* NOTE(review): with CUDA_VERSION<4020 this branch compiles to nothing and
           matstruct->mat stays unset for ELL/HYB -- presumably the option is
           rejected earlier; confirm in MatSetFromOptions_SeqAIJCUSPARSE */
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated above only in the non-compressed case */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* scratch vector sized by the number of compressed rows, used by MatMultAdd */
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);

    A->valid_GPU_matrix = PETSC_CUDA_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1358 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   MatCreateVecs_SeqAIJCUSPARSE - Creates CUDA vectors conforming to this
   matrix's layouts: *right is sized for the column space and *left for the
   row space.  Either output pointer may be NULL, in which case that vector
   is not created.  Both vectors share the matrix's PetscLayout.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    /* column-space vector (the input of MatMult) */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* row-space vector (the output of MatMult) */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1384 
/* Functor for thrust::for_each over a zip iterator of (source, destination)
   pairs: accumulates the first tuple element into the second.  Usable in
   both host and device code. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1394 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   MatMult_SeqAIJCUSPARSE - Computes yy = A*xx on the GPU with a cusparse
   SpMV in the storage format chosen at assembly time (CSR or ELL/HYB).

   matstruct->alpha and ->beta are device-resident scalars (copies of the
   host constants ALPHA/BETA uploaded in MatSeqAIJCUSPARSECopyToGPU, where
   the handle was also put in CUSPARSE_POINTER_MODE_DEVICE).
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscScalar                  *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB formats are both stored as a cusparse "hyb" matrix */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* block for completion only when the user did not supply a stream */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1435 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   MatMultTranspose_SeqAIJCUSPARSE - Computes yy = A^T*xx on the GPU.

   Rather than asking cusparse for a transposed SpMV, an explicit transpose
   of the matrix (cusparsestruct->matTranspose) is built lazily on first use
   and multiplied with CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscScalar                  *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* lazily build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* block for completion only when the user did not supply a stream */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1482 
1483 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   MatMultAdd_SeqAIJCUSPARSE - Computes zz = yy + A*xx on the GPU.

   The GPU matrix is stored compressed (empty rows removed), so the product
   is first computed into cusparsestruct->workVector (one entry per nonempty
   row) and then scattered into zz with a += through the row index map
   matstruct->cprowIndices, via thrust::for_each and VecCUDAPlusEquals.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  PetscScalar                     *xarray,*zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* start from zz = yy; the product is then added into zz */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* here we need to be careful to set the number of rows in the multiply to the
       number of compressed rows in the matrix ... which is equivalent to the
       size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the SpMV entirely when there are no nonempty rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray, matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1543 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - Computes zz = yy + A^T*xx on the GPU.

   Uses the lazily-built explicit transpose (cusparsestruct->matTranspose),
   computes the product into the compressed-row scratch workVector, then
   scatters it into zz with a += through matstructT->cprowIndices (same
   scheme as MatMultAdd_SeqAIJCUSPARSE).
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  thrust::device_ptr<PetscScalar> zptr;
  PetscScalar                     *xarray,*zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* lazily build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy; the product is then added into zz */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the SpMV entirely when there are no nonempty rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1609 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - Completes assembly of the host AIJ matrix,
   pushes the data to the GPU (for non-factor matrices), and, on a final
   assembly, installs the CUSPARSE multiply routines in the operations table.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factor matrices keep their data in the triangular-factor structures instead */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1628 
1629 /* --------------------------------------------------------------------------------*/
1630 #undef __FUNCT__
1631 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1632 /*@
1633    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1635    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1636    assembly performance the user should preallocate the matrix storage by setting
1637    the parameter nz (or the array nnz).  By setting these parameters accurately,
1638    performance during matrix assembly can be increased by more than a factor of 50.
1639 
1640    Collective on MPI_Comm
1641 
1642    Input Parameters:
1643 +  comm - MPI communicator, set to PETSC_COMM_SELF
1644 .  m - number of rows
1645 .  n - number of columns
1646 .  nz - number of nonzeros per row (same for all rows)
1647 -  nnz - array containing the number of nonzeros in the various rows
1648          (possibly different for each row) or NULL
1649 
1650    Output Parameter:
1651 .  A - the matrix
1652 
1653    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1655    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1656 
1657    Notes:
1658    If nnz is given then nz is ignored
1659 
1660    The AIJ format (also called the Yale sparse matrix format or
1661    compressed row storage), is fully compatible with standard Fortran 77
1662    storage.  That is, the stored row and column indices can begin at
1663    either one (as in Fortran) or zero.  See the users' manual for details.
1664 
1665    Specify the preallocated storage with either nz or nnz (not both).
1666    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1667    allocation.  For large problems you MUST preallocate memory or you
1668    will get TERRIBLE performance, see the users' manual chapter on matrices.
1669 
1670    By default, this format uses inodes (identical nodes) when possible, to
1671    improve numerical efficiency of matrix-vector products and solves. We
1672    search for consecutive rows with the same nonzero structure, thereby
1673    reusing matrix information to achieve increased efficiency.
1674 
1675    Level: intermediate
1676 
1677 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1678 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the cast drops const only to match the preallocation routine's signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1690 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   MatDestroy_SeqAIJCUSPARSE - Destructor for MATSEQAIJCUSPARSE matrices.

   Frees the GPU-side data attached to A->spptr (the multiply structures for
   regular matrices, or the triangular-factor structures for factor
   matrices), clears the composed function installed by
   MatCreate_SeqAIJCUSPARSE so the object can later change type cleanly,
   then delegates to the base SeqAIJ destructor.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* nothing to free on the GPU if the matrix was never copied over */
    if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) {
      ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* remove the function composed in MatCreate_SeqAIJCUSPARSE; standard PETSc
     practice so a subsequent MatSetType() does not inherit a stale pointer */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1708 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor for MATSEQAIJCUSPARSE.

   Builds the base SeqAIJ matrix, attaches the GPU bookkeeping to B->spptr
   (a Mat_SeqAIJCUSPARSE for regular matrices, a Mat_SeqAIJCUSPARSETriFactors
   for factor matrices), creates a cusparse handle in either case, and
   installs the CUSPARSE implementations of the matrix operations.

   Note: the previous version zeroed the handle/stream fields and then
   immediately overwrote them (dead stores, with stream assigned twice);
   a typed local pointer also replaces the repeated (cast)->field pattern.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;

    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    B->spptr           = cusp;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;

    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
    B->spptr                     = tri;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1764 
1765 /*M
1766    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1767 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1769    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1770    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1771 
1772    Options Database Keys:
1773 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1774 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1776 
1777   Level: beginner
1778 
1779 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1780 M*/
1781 
1782 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1783 
1784 
#undef __FUNCT__
#define __FUNCT__ "MatSolverPackageRegister_CUSPARSE"
/*
   MatSolverPackageRegister_CUSPARSE - Registers MATSOLVERCUSPARSE as a
   provider of LU, Cholesky, ILU, and ICC factorizations for
   MATSEQAIJCUSPARSE matrices.  All four factor types map to the same
   factory routine, MatGetFactor_seqaijcusparse_cusparse.
*/
PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1798 
1799 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
/*
   Frees the GPU data attached to a non-factored matrix: the multiply
   structures for A and its cached transpose, the scratch work vector, and
   the cusparse handle.  Resets *cusparsestruct to 0; a NULL input is a no-op.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
    delete (*cusparsestruct)->workVector;
    /* the handle may never have been created (e.g. cusparseCreate failed) */
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1820 
#undef __FUNCT__
#define __FUNCT__ "CsrMatrix_Destroy"
/*
   Frees a CsrMatrix (its thrust value/index/offset arrays and the struct
   itself) and resets *mat to 0.  A NULL input is a no-op, and deleting a
   NULL member pointer is harmless, so no per-member guards are needed.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;

    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1835 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
/*
   Frees one triangular-factor structure: its cusparse matrix descriptor,
   its solve-analysis info, and the CSR storage of the factor.  Safe to call
   with *trifactor == NULL; resets the pointer to 0 afterwards.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1853 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/*
   Frees a multiply structure: the stored matrix (a cusparse hyb matrix for
   ELL/HYB, a CsrMatrix otherwise), the matrix descriptor, the compressed
   row index array, and the device-resident alpha/beta scalars.  Resets
   *matstruct to 0; a NULL input is a no-op.

   Fix: the return code of CsrMatrix_Destroy() (a PetscErrorCode) was
   silently discarded; it is now checked with CHKERRQ per file convention.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha and beta were allocated with cudaMalloc in MatSeqAIJCUSPARSECopyToGPU */
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1882 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/*
   Frees the full set of triangular-factor data attached to a factored
   matrix: the four factor structures (lower/upper and their transposes),
   the row/column permutation index arrays, the scratch work vector, and
   the cusparse handle.  Resets *trifactors to 0; a NULL input is a no-op.

   Fix: the PetscErrorCode returned by each
   Mat_SeqAIJCUSPARSETriFactorStruct_Destroy() call was silently discarded;
   all four are now checked with CHKERRQ per file convention.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* the handle may never have been created (e.g. cusparseCreate failed) */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1907 
1908