xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision af0996ce37bc06907c37d8d91773840993d61e62)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 
6 #include <petscconf.h>
7 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include <../src/vec/vec/impls/dvecimpl.h>
10 #include <petsc/private/vecimpl.h>
11 #undef VecType
12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
13 
/* Human-readable names for MatCUSPARSEStorageFormat values; the trailing entries
   (enum type name, option prefix, terminating 0) follow the list layout that
   PetscOptionsEnum() expects -- see its use in MatSetFromOptions_SeqAIJCUSPARSE. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19 
20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23 
24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat);
29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
34 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
35 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
36 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
37 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
38 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
39 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix's
   CUSPARSE bookkeeping structure and directs the matrix's CUSPARSE handle
   to issue its work on that stream.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  stream - the CUDA stream to use for subsequent CUSPARSE operations
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *csp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  csp->stream = stream;
  status = cusparseSetStream(csp->handle,csp->stream);CHKERRCUSP(status);
  PetscFunctionReturn(0);
}
52 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - Installs a CUSPARSE handle in the matrix's
   Mat_SeqAIJCUSPARSE structure, destroying any handle previously stored
   there, and switches the new handle to device pointer mode.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  handle - the CUSPARSE handle to install
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* Fix: in the original, the unbraced "if" covered only the cusparseDestroy()
     call, leaving CHKERRCUSP(stat) to execute unconditionally -- so when no
     handle was present the macro inspected an uninitialized status value.
     Bracing the body restricts the error check to the path that set stat. */
  if (cusparsestruct->handle) {
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  /* scalar parameters (e.g. alpha/beta) will be passed by device pointer */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
67 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - Drops the matrix's reference to its CUSPARSE
   handle without calling cusparseDestroy() on it (presumably the handle is
   owned elsewhere; compare MatCUSPARSESetHandle, which does destroy).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *csp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (csp->handle) csp->handle = 0;
  PetscFunctionReturn(0);
}
78 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/* Query callback composed on factor matrices (under "MatFactorGetSolverPackage_C"
   in MatGetFactor_seqaijcusparse_cusparse below): reports that the factorization
   is provided by the CUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
87 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
100 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates an empty factor matrix B of
   type MATSEQAIJCUSPARSE, sized square by A's row map, and installs the
   symbolic-factorization function pointers matching the requested factor
   type.  LU/ILU/ILUDT use the LU symbolic routines; Cholesky/ICC use the
   Cholesky ones; any other factor type is rejected with PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;  /* factors are square, n x n */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* NOTE(review): block sizes are propagated from A only on this branch and
       not for Cholesky/ICC below -- confirm whether that asymmetry is intended. */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: no CPU storage is allocated here; presumably the
     symbolic factorization routines allocate the factor structure -- confirm. */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
127 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices: records the requested
   GPU storage format for the given operation.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  /* MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL take the same action here, so they
     share one case via fallthrough; every other operation is unsupported. */
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* before CUDA 4.2 only CSR is available: reject the other formats */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
152 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Output Parameter: none

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the composed type-specific implementation (for this file,
     MatCUSPARSESetFormat_SeqAIJCUSPARSE); "Try" means matrix types that do
     not compose the method are left untouched rather than erroring */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
181 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the command-line options that
   choose the GPU storage format:

   -mat_cusparse_mult_storage_format : format for MatMult only (offered only
                                       when the matrix is not a factor)
   -mat_cusparse_storage_format      : format for both SpMV and TriSolve
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* NOTE(review): PetscOptionsHead above has no matching PetscOptionsTail, and
     PetscObjectOptionsBegin (an options-block macro, not CHKERRQ-wrappable)
     pairs with the PetscOptionsEnd below -- confirm this pairing against the
     PETSc options API conventions. */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
210 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - Delegates the symbolic ILU
   factorization to the base SeqAIJ implementation, then redirects the
   numeric stage to the CUSPARSE version so the triangular factors are
   uploaded to the GPU afterwards.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
222 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - Delegates the symbolic LU
   factorization to the base SeqAIJ implementation, then redirects the
   numeric stage to the CUSPARSE version (same pattern as the ILU wrapper
   above).
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
234 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - Delegates the symbolic ICC
   factorization to the base SeqAIJ implementation, then points the numeric
   stage at the CUSPARSE Cholesky routine so the factors are uploaded to the
   GPU afterwards.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
246 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Delegates the symbolic Cholesky
   factorization to the base SeqAIJ implementation, then points the numeric
   stage at the CUSPARSE Cholesky routine (same pattern as the ICC wrapper
   above).
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
258 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Extracts the unit lower
   triangular factor L from the combined CPU ILU factor stored in A's
   Mat_SeqAIJ arrays, packs it into a CSR matrix with explicit 1.0 diagonal
   entries, uploads it to the GPU via thrust arrays, and runs the CUSPARSE
   triangular-solve analysis on it.  Runs only when the GPU copy is absent
   (PETSC_CUSP_UNALLOCATED) or stale (PETSC_CUSP_CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): loTriFactor is read from loTriFactorPtr here but replaced by
     a fresh "new" below without freeing any prior value -- looks like a leak if
     this routine can run twice for one matrix; confirm against the destroy path. */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;    /* host-pinned CSR row offsets / column indices of L */
  PetscScalar                       *AALo;           /* host-pinned CSR values of L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the later thrust device assigns copy efficiently */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      /* row 0 of L holds only the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* the assign() calls below copy the pinned host arrays to the device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
362 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Extracts the (non-unit) upper
   triangular factor U from the combined CPU ILU factor of A, walking the
   diag-based layout (a->diag) backwards row by row, stores the reciprocal of
   each diagonal entry, uploads the CSR data to the GPU, and runs the CUSPARSE
   triangular-solve analysis.  Runs only when the GPU copy is absent or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): as in the lower-triangular builder, the existing
     upTriFactorPtr value is overwritten by a fresh "new" below without being
     freed -- confirm against the destroy path. */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;    /* host-pinned CSR row offsets / column indices of U */
  PetscScalar                       *AAUp;           /* host-pinned CSR values of U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag runs backwards through a->a, hence adiag[0]-adiag[n] is the count */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      /* walk the rows from last to first, filling the CSR arrays back-to-front */
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* the diagonal is stored inverted: 1/U(i,i), so the solve can multiply */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* the assign() calls below copy the pinned host arrays to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
462 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Uploads both ILU triangular
   factors of A to the GPU, allocates the scratch vector used between the
   two triangular solves, marks the GPU copy current, and caches the row and
   column permutations on the device whenever they are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowPerm  = aij->row;
  IS                           colPerm  = aij->icol;
  const PetscInt               *rowIdx,*colIdx;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* build, upload, and analyze the L and U factors */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector of length n, plus the factor's nonzero count */
  factors->workVector = new THRUSTARRAY;
  factors->workVector->resize(n);
  factors->nnz = aij->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;

  /* device copy of the row permutation, only when it is nontrivial */
  ierr = ISGetIndices(rowPerm,&rowIdx);CHKERRQ(ierr);
  ierr = ISIdentity(rowPerm,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(rowIdx, rowIdx+n);
  }
  ierr = ISRestoreIndices(rowPerm,&rowIdx);CHKERRQ(ierr);

  /* device copy of the column permutation, only when it is nontrivial */
  ierr = ISGetIndices(colPerm,&colIdx);CHKERRQ(ierr);
  ierr = ISIdentity(colPerm,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity) {
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(colIdx, colIdx+n);
  }
  ierr = ISRestoreIndices(colPerm,&colIdx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
503 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds both triangular factors for
   an ICC factorization on the GPU from the CPU factor stored in A.  Only the
   upper factor's CSR structure is materialized (AiUp/AjUp); the "lower" solve
   reuses the same structure with CUSPARSE_OPERATION_TRANSPOSE and a separate
   value array (AALo).  Runs only when the GPU copy is absent or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  /* NOTE(review): both pointers below are overwritten by fresh "new"s without
     freeing any prior value -- confirm no leak on repeated factorization. */
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;   /* host-pinned CSR structure of the upper factor */
  PetscScalar                       *AAUp;          /* host-pinned values of the upper factor */
  PetscScalar                       *AALo;          /* host-pinned values for the transposed "lower" solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is cast both to Mat_SeqAIJ (above) and Mat_SeqSBAIJ
     here and the i/j/a fields are read through the SBAIJ view -- confirm the
     two structs share these field offsets for ICC factor matrices. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* diagonal stored as its reciprocal for both value arrays */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals: negate for the upper solve, and additionally scale
             by the reciprocal diagonal for the transposed lower solve */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" factor keeps the upper CSR structure; the lower solve is
         expressed as the TRANSPOSE of this upper-stored matrix (see solveOp) */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
650 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Uploads the ICC factor data to
   the GPU, allocates the solve scratch vector, and caches the (symmetric)
   permutation on the device when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  const PetscInt               *permIdx;
  PetscBool                    permIsIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  factors->workVector = new THRUSTARRAY;
  factors->workVector->resize(n);
  /* both strict triangles plus the diagonal */
  factors->nnz = (aij->nz-n)*2 + n;

  /* ICC uses the same permutation for rows and columns */
  ierr = ISGetIndices(perm,&permIdx);CHKERRQ(ierr);
  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(permIdx, permIdx+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(permIdx, permIdx+n);
  }
  ierr = ISRestoreIndices(perm,&permIdx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
681 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: run the CPU factorization, select the solve
   kernels appropriate for the orderings, then copy the factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact    = (Mat_SeqAIJ*)B->data;
  IS             rowperm  = fact->row,colperm = fact->col;
  PetscBool      rowident,colident;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the natural-ordering solve variants skip the permutation steps */
  ierr = ISIdentity(rowperm,&rowident);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colident);CHKERRQ(ierr);
  if (rowident && colident) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
708 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky/ICC factorization: run the CPU factorization, select the
   solve kernels based on the ordering, then copy the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm  = fact->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* the natural-ordering solve variants skip the permutation steps */
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  B->ops->solve          = identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                    : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                    : MatSolveTranspose_SeqAIJCUSPARSE;

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
735 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSETransposeTriFactor"
/* Build the transpose (CSC) of one GPU triangular factor and run the cusparse
   solve analysis on it.  The new factor's fill mode is flipped (the transpose
   of a lower factor is upper and vice versa); index base, matrix type and
   diagonal type are copied from the input factor.  On success *factT_out
   holds the newly allocated transposed factor.
   This helper removes the duplicated lower/upper code of the original. */
static PetscErrorCode MatSeqAIJCUSPARSETransposeTriFactor(cusparseHandle_t handle,Mat_SeqAIJCUSPARSETriFactorStruct *fact,Mat_SeqAIJCUSPARSETriFactorStruct **factT_out)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *factT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transposed factor */
  factT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor of the input factor, flipping the fill mode */
  matrixType = cusparseGetMatType(fact->descr);
  indexBase  = cusparseGetMatIndexBase(fact->descr);
  fillMode   = cusparseGetMatFillMode(fact->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(fact->descr);

  /* create the matrix description */
  stat = cusparseCreateMatDescr(&factT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(factT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(factT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(factT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(factT->descr, diagType);CHKERRCUSP(stat);

  /* create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&factT->solveInfo);CHKERRCUSP(stat);

  /* the transposed data is stored explicitly, so it is solved NON_TRANSPOSE */
  factT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the factor */
  factT->csrMat = new CsrMatrix;
  factT->csrMat->num_rows       = fact->csrMat->num_rows;
  factT->csrMat->num_cols       = fact->csrMat->num_cols;
  factT->csrMat->num_entries    = fact->csrMat->num_entries;
  factT->csrMat->row_offsets    = new THRUSTINTARRAY32(fact->csrMat->num_rows+1);
  factT->csrMat->column_indices = new THRUSTINTARRAY32(fact->csrMat->num_entries);
  factT->csrMat->values         = new THRUSTARRAY(fact->csrMat->num_entries);

  /* compute the transpose of the factor, i.e. the CSC */
  stat = cusparse_csr2csc(handle, fact->csrMat->num_rows,
                          fact->csrMat->num_cols, fact->csrMat->num_entries,
                          fact->csrMat->values->data().get(),
                          fact->csrMat->row_offsets->data().get(),
                          fact->csrMat->column_indices->data().get(),
                          factT->csrMat->values->data().get(),
                          factT->csrMat->column_indices->data().get(),
                          factT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(handle, factT->solveOp,
                           factT->csrMat->num_rows, factT->csrMat->num_entries,
                           factT->descr, factT->csrMat->values->data().get(),
                           factT->csrMat->row_offsets->data().get(), factT->csrMat->column_indices->data().get(),
                           factT->solveInfo);CHKERRCUSP(stat);

  *factT_out = factT;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Build the transposes of both triangular factors of A so that the
   MatSolveTranspose variants can run plain (NON_TRANSPOSE) triangular solves
   on explicitly transposed data.  The results are cached on A->spptr. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT,*upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor(cusparseTriFactors->handle,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor(cusparseTriFactors->handle,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
868 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/*
   Build and cache the transpose of A's GPU matrix (cusparsestruct->matTranspose)
   so MatMultTranspose can apply a plain mat-vec to explicitly transposed data.
   CSR storage is transposed with csr2csc; ELL/HYB storage is converted
   HYB -> CSR -> CSC -> HYB, which requires CUDA 5.0 or later.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; /* NOTE(review): dead initializer -- overwritten by new below */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate and describe the mult structure for the transpose */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* device-resident copies of ALPHA and BETA for DEVICE pointer-mode calls */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* NOTE(review): dimensions are not swapped for the transpose and row_offsets
       is sized by rmap->n; looks correct only for square A -- TODO confirm */
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* transpose the matrix: the CSC of A, stored as the CSR of A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices; allocated empty here -- NOTE(review):
     presumably filled elsewhere if compressed rows are used, TODO confirm */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* cache the transpose on the matrix */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
997 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/*
   Solve A^T x = b when a row/column permutation is in effect.  The transposed
   factors are built lazily on first use.  rpermIndices/cpermIndices are
   assumed non-NULL here: this variant is only installed when the ordering is
   not the identity (see MatLUFactorNumeric_SeqAIJCUSPARSE) -- TODO confirm
   all factorization paths set them consistently.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder b into x with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, triangular solve with the transposed upper factor: x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, triangular solve with the transposed lower factor: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1061 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solve A^T x = b with natural ordering (no permutations): apply the
   transposed upper factor, then the transposed lower factor.  The transposed
   factors are generated lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors on the fly if they do not exist yet */
  if (!loFactT && !upFactT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* triangular solve with the transposed upper factor: b -> work */
  stat = cusparse_solve(factors->handle, upFactT->solveOp,
                        upFactT->csrMat->num_rows, &ALPHA, upFactT->descr,
                        upFactT->csrMat->values->data().get(),
                        upFactT->csrMat->row_offsets->data().get(),
                        upFactT->csrMat->column_indices->data().get(),
                        upFactT->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* triangular solve with the transposed lower factor: work -> x */
  stat = cusparse_solve(factors->handle, loFactT->solveOp,
                        loFactT->csrMat->num_rows, &ALPHA, loFactT->descr,
                        loFactT->csrMat->values->data().get(),
                        loFactT->csrMat->row_offsets->data().get(),
                        loFactT->csrMat->column_indices->data().get(),
                        loFactT->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* release the device pointers and wait for the solves to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1111 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/*
   Solve A x = b when a row/column permutation is in effect: permute b by the
   row permutation, forward/backward triangular solves, then apply the column
   permutation to the result.  Both vectors must be of type VECSEQCUSP.
   rpermIndices/cpermIndices are assumed non-NULL: this variant is only
   installed for non-identity orderings -- TODO confirm all paths set them.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  VecType                           t;
  PetscBool                         flg;

  PetscFunctionBegin;
  /* both vectors must already live on the GPU as VECSEQCUSP */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUSP);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUSP,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUSP);

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, reorder b into x with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve with the lower factor: x -> temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve with the upper factor: temp -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1175 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solve A x = b with the cached GPU triangular factors, natural ordering:
   solve with the lower factor first, then with the upper factor. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loFact  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upFact  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* triangular solve with the lower factor: b -> work */
  stat = cusparse_solve(factors->handle, loFact->solveOp,
                        loFact->csrMat->num_rows, &ALPHA, loFact->descr,
                        loFact->csrMat->values->data().get(),
                        loFact->csrMat->row_offsets->data().get(),
                        loFact->csrMat->column_indices->data().get(),
                        loFact->solveInfo,
                        barray->data().get(), work->data().get());CHKERRCUSP(stat);

  /* triangular solve with the upper factor: work -> x */
  stat = cusparse_solve(factors->handle, upFact->solveOp,
                        upFact->csrMat->num_rows, &ALPHA, upFact->descr,
                        upFact->csrMat->values->data().get(),
                        upFact->csrMat->row_offsets->data().get(),
                        upFact->csrMat->column_indices->data().get(),
                        upFact->solveInfo,
                        work->data().get(), xarray->data().get());CHKERRCUSP(stat);

  /* release the device pointers and wait for the solves to finish */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1217 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the host (CPU) AIJ matrix data of A to the GPU, (re)building the
   CUSPARSE multiply structure in the storage format selected by
   cusparsestruct->format.  A no-op when the GPU copy is already current
   (valid_GPU_matrix is neither UNALLOCATED nor CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* discard any stale GPU copy before rebuilding it */
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
    try {
      /* count the rows that contain at least one nonzero */
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build the compressed row
           pointers (ii) and the map from compressed row to full row (ridx)
           from the full row pointer array a->i */
        int k=0;
        ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        /* from here on m is the compressed (nonzero) row count */
        m = cusparsestruct->nonzerorow;
      }

      /* allocate space for the matrix-vector multiply data structure */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

      /* alpha and beta live on the device; CUSPARSE is put in device pointer
         mode so spmv reads the scalars directly from GPU memory */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
/* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

/* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* ELL/HYB: stage the data in a temporary CSR matrix, convert with
           csr2hyb, then free the staging arrays */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUSP(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (matrix) {
          if (matrix->values) delete (THRUSTARRAY*)matrix->values;
          if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
          if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
          delete (CsrMatrix*)matrix;
        }
#endif
        /* NOTE(review): when CUDA_VERSION<4020 this branch compiles to nothing
           and matstruct->mat is left unset -- confirm the ELL/HYB options are
           unreachable on older CUDA versions */
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated above only in the forced-compression branch */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      /* scratch vector of compressed-row length used by MatMultAdd and
         MatMultTransposeAdd before scattering into the full result */
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUSP(ierr);

    A->valid_GPU_matrix = PETSC_CUSP_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1351 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/* Creates CUSP vectors conforming to this matrix: "right" is sized for the
   column space (input of MatMult), "left" for the row space (output). Either
   output may be NULL when not wanted. */
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (left) {
    /* a left vector conforms to the rows of the matrix */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  if (right) {
    /* a right vector conforms to the columns of the matrix */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1377 
/* Functor for thrust::for_each over a zip iterator: accumulates the first
   tuple component into the second (dest += src).  Used to scatter-add the
   compressed-row spmv result into the full output vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1387 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   Computes yy = A*xx on the GPU with cusparse csrmv (CSR format) or hybmv
   (ELL/HYB).  alpha and beta are device-resident scalars (the handle is in
   CUSPARSE_POINTER_MODE_DEVICE; see MatSeqAIJCUSPARSECopyToGPU).
   NOTE(review): the GPU matrix is built with num_rows equal to the number of
   rows containing nonzeros (compressed-row form), yet here the product is
   written to the leading entries of yy without a scatter -- confirm this is
   correct when the matrix has empty rows.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* y = alpha*A*x + beta*y with the CSR data built at copy-to-GPU time */
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB formats share the cusparse hybrid matrix type */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is attached */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1428 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   Computes yy = A^T*xx on the GPU.  The transpose is stored explicitly in
   cusparsestruct->matTranspose (built lazily on first use), so the CUSPARSE
   operation passed to spmv is NON_TRANSPOSE.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* y = alpha*A^T*x + beta*y using the stored transpose in CSR form */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream is attached */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1475 
1476 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A*xx on the GPU.  yy is first copied into zz, the
   compressed-row product A*xx is formed in workVector, and the result is
   scattered-added into zz through cprowIndices (the compressed-to-full row
   map) with a thrust permutation iterator.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* zz starts as a copy of yy; the product is added to it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    /* NOTE(review): yarray is acquired but never referenced below; the yy
       contribution already entered through the VecCopy above */
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* here we need to be careful to set the number of rows in the multiply to the
       number of compressed rows in the matrix ... which is equivalent to the
       size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the spmv entirely when there are no compressed rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray->data().get(), matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1536 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   Computes zz = yy + A^T*xx on the GPU, using the lazily built explicit
   transpose (matstructT) and the same work-vector/scatter-add scheme as
   MatMultAdd_SeqAIJCUSPARSE.
   NOTE(review): workVector was sized from A's compressed row count while the
   transpose product has A->cmap->n entries -- confirm the sizes agree through
   matstructT->cprowIndices.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* zz starts as a copy of yy; the product is added to it below */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray->data().get(), matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the spmv entirely when there are no compressed rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray->data().get(), matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1602 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Completes assembly on the host, mirrors the data onto the GPU for
   non-factored matrices, and for a final (non-flush) assembly installs the
   CUSPARSE mat-vec function pointers. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* finish the standard host-side assembly first */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their data in the triangular-factor structures */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1621 
1622 /* --------------------------------------------------------------------------------*/
1623 #undef __FUNCT__
1624 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1625 /*@
1626    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1628    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1629    assembly performance the user should preallocate the matrix storage by setting
1630    the parameter nz (or the array nnz).  By setting these parameters accurately,
1631    performance during matrix assembly can be increased by more than a factor of 50.
1632 
1633    Collective on MPI_Comm
1634 
1635    Input Parameters:
1636 +  comm - MPI communicator, set to PETSC_COMM_SELF
1637 .  m - number of rows
1638 .  n - number of columns
1639 .  nz - number of nonzeros per row (same for all rows)
1640 -  nnz - array containing the number of nonzeros in the various rows
1641          (possibly different for each row) or NULL
1642 
1643    Output Parameter:
1644 .  A - the matrix
1645 
1646    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1648    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1649 
1650    Notes:
1651    If nnz is given then nz is ignored
1652 
1653    The AIJ format (also called the Yale sparse matrix format or
1654    compressed row storage), is fully compatible with standard Fortran 77
1655    storage.  That is, the stored row and column indices can begin at
1656    either one (as in Fortran) or zero.  See the users' manual for details.
1657 
1658    Specify the preallocated storage with either nz or nnz (not both).
1659    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1660    allocation.  For large problems you MUST preallocate memory or you
1661    will get TERRIBLE performance, see the users' manual chapter on matrices.
1662 
1663    By default, this format uses inodes (identical nodes) when possible, to
1664    improve numerical efficiency of matrix-vector products and solves. We
1665    search for consecutive rows with the same nonzero structure, thereby
1666    reusing matrix information to achieve increased efficiency.
1667 
1668    Level: intermediate
1669 
1670 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1671 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the type is already set, so the SeqAIJ preallocation routine is called
     directly; the cast drops const to match its signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1683 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Releases the GPU-side data held in A->spptr (triangular factors for
   factored matrices, mult structures otherwise) and then destroys the
   underlying SeqAIJ matrix. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices carry triangular-factor data in spptr */
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
    /* a plain matrix only has GPU data once something was copied up */
    ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1701 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Type constructor for MATSEQAIJCUSPARSE: builds the base SeqAIJ matrix,
   attaches the GPU data structure appropriate for the factor type (a mult
   structure for regular matrices, triangular-factor pointers otherwise),
   creates the CUSPARSE handle, and installs the type's function table.

   Cleanup of the original: the handle and stream fields were each assigned
   twice (dead stores) and every member initialization repeated a cast of
   B->spptr; a typed local pointer is used instead.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;
    B->spptr = cusparsestruct;

    cusparsestruct->mat          = 0;
    cusparsestruct->matTranspose = 0;
    cusparsestruct->workVector   = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparsestruct->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr = cusparseTriFactors;

    cusparseTriFactors->loTriFactorPtr          = 0;
    cusparseTriFactors->upTriFactorPtr          = 0;
    cusparseTriFactors->loTriFactorPtrTranspose = 0;
    cusparseTriFactors->upTriFactorPtrTranspose = 0;
    cusparseTriFactors->rpermIndices            = 0;
    cusparseTriFactors->cpermIndices            = 0;
    cusparseTriFactors->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparseTriFactors->handle                  = handle;
    cusparseTriFactors->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet */
  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1757 
1758 /*M
1759    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1760 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1762    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1763    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1764 
1765    Options Database Keys:
1766 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1767 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1769 
1770   Level: beginner
1771 
1772 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1773 M*/
1774 
1775 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1776 
1777 
#undef __FUNCT__
#define __FUNCT__ "MatSolverPackageRegister_CUSPARSE"
/* Registers the CUSPARSE solver package as a factorization provider for
   MATSEQAIJCUSPARSE matrices, for each supported factor type
   (LU, Cholesky, ILU, ICC). */
PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  /* all four factorizations are served by the same factory routine */
  for (i = 0; i < sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1791 
1792 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
/*
   Frees all GPU-resident data of a Mat_SeqAIJCUSPARSE (the mult structure,
   its transpose, the work vector, and the CUSPARSE handle) and zeroes the
   caller's pointer.  Safe to call with an already-NULL structure.

   Fixes over the original: the PetscErrorCode returned by
   Mat_SeqAIJCUSPARSEMultStruct_Destroy was silently dropped (it is checked
   with CHKERRQ elsewhere in this file), and the handle test used an
   assignment inside the if-condition.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;  /* deleting NULL is a no-op */
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSP(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1813 
#undef __FUNCT__
#define __FUNCT__ "CsrMatrix_Destroy"
/* Deallocates the device arrays of a CsrMatrix and the structure itself,
   zeroing the caller's pointer.  Safe to call with an already-NULL matrix. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1828 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
/* Tears down one triangular-factor structure: the CUSPARSE matrix
   descriptor, the solve-analysis info, and the CSR storage of the factor;
   zeroes the caller's pointer.  Safe to call with an already-NULL pointer. */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSP(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUSP(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1846 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/*
   Frees a mult structure: the stored matrix (HYB handle or CSR arrays,
   depending on format), the matrix descriptor, the compressed-row index
   array, and the device-resident alpha/beta scalars; zeroes the caller's
   pointer.  Safe to call with an already-NULL pointer.

   Fix over the original: the PetscErrorCode returned by CsrMatrix_Destroy
   was silently dropped here while being CHKERRQ-checked elsewhere in the
   file; it is now checked.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      err;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); }
    delete (*matstruct)->cprowIndices;  /* deleting NULL is a no-op */
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUSP(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUSP(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1875 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/*
   Frees the complete triangular-factor container: all four factor
   structures, the permutation index arrays, the work vector, and the
   CUSPARSE handle; zeroes the caller's pointer.  Safe to call with an
   already-NULL pointer.

   Fixes over the original: the PetscErrorCode returned by each
   Mat_SeqAIJCUSPARSETriFactorStruct_Destroy call was silently dropped, and
   the handle test used an assignment inside the if-condition.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;  /* deleting NULL is a no-op */
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSP(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1900 
1901