xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ab25e6cbe03f3fa4fa82daf2a11a757d3a4c81c2)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 
6 #include <petscconf.h>
7 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include <../src/vec/vec/impls/dvecimpl.h>
10 #include <petsc-private/vecimpl.h>
11 #undef VecType
12 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
13 
/* Enum name strings consumed by PetscOptionsEnum(): the storage-format values,
   the enum type name, and the option prefix (list is NULL-terminated). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19 
20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23 
24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
34 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
35 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
36 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
37 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
38 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
39 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the CUDA stream on the matrix's GPU data
   structure and attaches it to the CUSPARSE handle so that subsequent
   CUSPARSE operations on this matrix are issued into that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,stream);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
52 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - adopts the given CUSPARSE library handle for this
   matrix, destroying any handle previously owned, and switches the handle
   to device pointer mode (scalars passed by device pointer).
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) {
    /* Braces are required here: previously CHKERRCUSP(stat) executed
       unconditionally and read an uninitialized `stat` when there was no
       prior handle to destroy. */
    stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat);
  }
  cusparsestruct->handle = handle;
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
  PetscFunctionReturn(0);
}
67 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - detaches (without destroying) any CUSPARSE handle
   currently recorded on the matrix.  The caller retains ownership of the
   handle itself.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct->handle) PetscFunctionReturn(0);
  cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
78 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   Query callback composed on factor matrices created by
   MatGetFactor_seqaij_cusparse(): reports MATSOLVERCUSPARSE as the solver
   package backing this factor.  The Mat argument is unused.
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
87 
88 /*MC
89   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
92   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
93   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
94   algorithms are not recommended. This class does NOT support direct solver operations.
95 
96   Level: beginner
97 
98 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
99 M*/
100 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaij_cusparse"
/*
   Factory routine registered for MATSOLVERCUSPARSE: creates the empty factor
   matrix B (type MATSEQAIJCUSPARSE, square, sized from A's row map) and
   installs the symbolic-factorization function pointers appropriate to the
   requested factor type.  LU/ILU/ILUDT and Cholesky/ICC are supported; any
   other factor type is an error.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* Inherit block sizes from A for every supported factor type; this was
     previously done only in the LU/ILU branch, leaving Cholesky/ICC factors
     without block-size information. */
  ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* The factor's storage is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
127 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   Type-specific implementation behind MatCUSPARSESetFormat() for
   SEQAIJCUSPARSE matrices: records the requested GPU storage format.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations set the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* ELL/HYB storage requires CUDA 4.2; CSR is the only valid choice here */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
152 
153 /*@
154    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
155    operation. Only the MatMult operation can use different GPU storage formats
156    for MPIAIJCUSPARSE matrices.
157    Not Collective
158 
159    Input Parameters:
160 +  A - Matrix of type SEQAIJCUSPARSE
161 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
162 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
163 
164    Output Parameter:
165 
166    Level: intermediate
167 
168 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
169 @*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* Dispatch to the type-specific implementation; PetscTryMethod() is a
     no-op when the matrix type has not composed MatCUSPARSESetFormat_C. */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
181 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   Processes the -mat_cusparse_mult_storage_format (SpMV only, unfactored
   matrices) and -mat_cusparse_storage_format (SpMV and triangular solve)
   command-line options and forwards them to MatCUSPARSESetFormat().
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead() is invoked before PetscObjectOptionsBegin()
     and there is no matching PetscOptionsTail(); verify against the PETSc
     options-processing conventions of this release. */
  ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the MULT-only format option is meaningful only for unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
209 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ILU: delegate entirely to the host (SeqAIJ) implementation, then
   override the numeric-factorization op so the CUSPARSE variant runs next.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
221 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic LU: delegate entirely to the host (SeqAIJ) implementation, then
   override the numeric-factorization op so the CUSPARSE variant runs next.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
233 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic ICC: delegate entirely to the host (SeqAIJ) implementation, then
   override the numeric-factorization op so the CUSPARSE variant runs next.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
245 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/*
   Symbolic Cholesky: delegate entirely to the host (SeqAIJ) implementation,
   then override the numeric-factorization op so the CUSPARSE variant runs next.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
257 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   Builds the lower-triangular ILU factor L (unit diagonal) on the GPU from
   the host-side factored SeqAIJ data in A->data.  The strictly-lower part of
   each row is copied into pinned host arrays (AiLo/AjLo/AALo) with an
   explicit 1.0 appended on the diagonal, the arrays are uploaded into a new
   CsrMatrix via thrust, and cusparse_analysis() is run to prepare the
   triangular solve.  Skipped entirely if the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* = n unit-diagonal entries + strictly-lower entries of rows 1..n-1
         (row 0 of L is just the diagonal 1).
         NOTE(review): reads ai[1], so this assumes n > 0 — confirm callers
         never reach here with an empty matrix. */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host memory so the later thrust assign() uploads are fast */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* assign() copies the pinned host arrays to the device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
361 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   Builds the upper-triangular ILU factor U on the GPU from the host-side
   factored SeqAIJ data.  Rows are walked backwards via a->diag; the stored
   inverse diagonal (1/d_i) is placed first in each row, followed by the
   strictly-upper entries, then the pinned host arrays are uploaded and
   cusparse_analysis() prepares the triangular solve.  Skipped if the GPU
   copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] decreases here (U is stored backwards in SeqAIJ factors) */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory so the later thrust assign() uploads are fast */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        /* SeqAIJ factors store 1/d_i at the diagonal slot, hence 1./v[nz] */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* assign() copies the pinned host arrays to the device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
461 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   Uploads both ILU triangular factors to the GPU, allocates the work vector
   used between the two triangular solves, and caches the row/column
   permutation indices on the device (only when they are not the identity,
   so the natural-ordering solve path can skip permutation entirely).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector holding the intermediate result between L- and U-solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  A->valid_GPU_matrix = PETSC_CUSP_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
502 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   Builds both triangular factors of an ICC factorization on the GPU from the
   host factor.  Only the upper triangle U is stored explicitly; the "lower"
   factor reuses the same sparsity (AiUp/AjUp) with scaled values (AALo) and
   is solved as U^T via CUSPARSE_OPERATION_TRANSPOSE with an UPPER fill mode.
   Skipped if the GPU copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (for nz above) and as
     Mat_SeqSBAIJ (for i/j/a here) — presumably the ICC factor uses SBAIJ-style
     storage; verify against the factor type actually produced upstream. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host memory; AALo shares AiUp/AjUp sparsity, so only values differ */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the diagonal entry of row i (stored last in the row) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = 1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = 1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* negate off-diagonals for U; scale by the diagonal for the L values */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* the "lower" solve is expressed as U^T: UPPER fill mode + TRANSPOSE op */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity as U; only the values (AALo) differ */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUSP_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
649 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   Uploads the ICC triangular factors to the GPU, allocates the work vector
   used between the two triangular solves, and caches the (symmetric)
   permutation on the device when it is not the identity.  nnz accounts for
   both triangles sharing the off-diagonal entries: 2*(nz-n) off-diagonals
   plus n diagonals.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  /* ICC uses one symmetric permutation for both row and column sides */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
680 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Performs the host-side numeric LU factorization, then selects the solve
   kernels (permuting vs. natural ordering) and copies the factors to the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* Any non-identity ordering forces the permuting MatSolve variants */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!row_identity || !col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* upload the triangular factors and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
707 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Performs the host-side numeric Cholesky factorization, then picks the solve
   kernels based on the ordering and copies the factor data to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* a non-identity ordering forces the permuting MatSolve variants */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* upload the triangular factor data and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
734 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Builds explicit transposes (CSC form) of the lower and upper triangular
   factors on the GPU and runs the cusparse solve analysis on them.  The
   MatSolveTranspose_SeqAIJCUSPARSE* routines then perform the transpose solve
   as two ordinary (NON_TRANSPOSE) triangular solves against these copies. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; /* built below; callers see it via cusparseTriFactors */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptor settings of the lower triangular factor; the
     fill mode is flipped because the transpose of a lower triangular matrix
     is upper triangular */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the transpose is stored explicitly, so it is solved without transposing */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the transpose on the factor container */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the matrix descriptor settings of the upper triangular factor,
     again flipping the fill mode for the transpose */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* the transpose is stored explicitly, so it is solved without transposing */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUSP(stat);

  /* publish the transpose on the factor container */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
867 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Builds an explicit transpose of the matrix on the GPU, in the storage format
   selected for A (CSR, ELL, or HYB), so that MatMultTranspose can be carried
   out as an ordinary (non-transposed) multiply against the transposed data. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT; /* built below; callers see it via cusparsestruct */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information; the descriptor mirrors the
     index base of the original matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);

  /* device-resident alpha and beta for the spmv calls (device pointer mode) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUSP(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* NOTE(review): the dimensions are not swapped for the transpose; this is
       consistent only for square matrices (rmap->n == cmap->n) -- confirm the
       rectangular case is handled elsewhere */
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSP(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSP(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete the two intermediate CSR copies */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (empty for the transpose) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the transpose on the matrix container */
  cusparsestruct->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
996 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solves A^T x = b on the GPU when the factorization used a non-identity
   ordering.  The transposes of the triangular factors are stored explicitly
   (built lazily on the first call), so both triangular solves below run with
   the factors' NON_TRANSPOSE solveOp. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU, *bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors on the fly, the
     first time they are needed; afterwards they are reused */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, apply the row permutation: xGPU[i] = bGPU[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, solve with the transposed upper factor (factors are applied in
     reverse order for the transpose solve) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, solve with the transposed lower factor */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore the vector arrays and wait for the GPU work to complete */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1060 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A^T x = b on the GPU when the factorization used the natural
   ordering, so no row/column permutations are applied.  The explicitly stored
   factor transposes are built lazily on the first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loFactT    = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upFactT    = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *workGPU    = (THRUSTARRAY*)triFactors->workVector;
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* the transposed factors are generated on the fly, on first use */
  if (!loFactT && !upFactT) {
    ierr    = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
    upFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  }

  /* device views of the output and input vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve with the transposed upper factor: result goes to the work vector */
  cstat = cusparse_solve(triFactors->handle, upFactT->solveOp,
                         upFactT->csrMat->num_rows, &ALPHA, upFactT->descr,
                         upFactT->csrMat->values->data().get(),
                         upFactT->csrMat->row_offsets->data().get(),
                         upFactT->csrMat->column_indices->data().get(),
                         upFactT->solveInfo,
                         barray->data().get(), workGPU->data().get());CHKERRCUSP(cstat);

  /* solve with the transposed lower factor: work vector -> solution */
  cstat = cusparse_solve(triFactors->handle, loFactT->solveOp,
                         loFactT->csrMat->num_rows, &ALPHA, loFactT->descr,
                         loFactT->csrMat->values->data().get(),
                         loFactT->csrMat->row_offsets->data().get(),
                         loFactT->csrMat->column_indices->data().get(),
                         loFactT->solveInfo,
                         workGPU->data().get(), xarray->data().get());CHKERRCUSP(cstat);

  /* release the device views, block until the solves finish, log the flops */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1110 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solves A x = b on the GPU using the triangular factors, applying the row
   permutation to b on the way in and the column permutation to x on the way
   out.  Installed when the factorization ordering is not the identity. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  CUSPARRAY                         *xGPU,*bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);

  /* First, apply the row permutation: xGPU[i] = bGPU[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
               xGPU->begin());

  /* Next, the forward solve with L (permuted b -> work vector) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);

  /* Then, the backward solve with U (work vector -> xGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());

  /* restore the vector arrays, wait for the GPU, and log the flops */
  ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1165 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A x = b on the GPU for factors computed with the natural ordering;
   no permutations of b or x are needed. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loFact     = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upFact     = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                       *workGPU    = (THRUSTARRAY*)triFactors->workVector;
  CUSPARRAY                         *xarray,*barray;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device views of the output and input vectors */
  ierr = VecCUSPGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward solve with L: b -> work vector */
  cstat = cusparse_solve(triFactors->handle, loFact->solveOp,
                         loFact->csrMat->num_rows, &ALPHA, loFact->descr,
                         loFact->csrMat->values->data().get(),
                         loFact->csrMat->row_offsets->data().get(),
                         loFact->csrMat->column_indices->data().get(),
                         loFact->solveInfo,
                         barray->data().get(), workGPU->data().get());CHKERRCUSP(cstat);

  /* backward solve with U: work vector -> solution */
  cstat = cusparse_solve(triFactors->handle, upFact->solveOp,
                         upFact->csrMat->num_rows, &ALPHA, upFact->descr,
                         upFact->csrMat->values->data().get(),
                         upFact->csrMat->row_offsets->data().get(),
                         upFact->csrMat->column_indices->data().get(),
                         upFact->solveInfo,
                         workGPU->data().get(), xarray->data().get());CHKERRCUSP(cstat);

  /* release the device views, block until the solves finish, log the flops */
  ierr = VecCUSPRestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1207 
1208 #undef __FUNCT__
1209 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
1210 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1211 {
1212 
1213   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1214   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1215   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1216   PetscInt                     m = A->rmap->n,*ii,*ridx;
1217   PetscErrorCode               ierr;
1218   cusparseStatus_t             stat;
1219   cudaError_t                  err;
1220 
1221   PetscFunctionBegin;
1222   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1223     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1224     if (matstruct) {
1225       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
1226     }
1227     try {
1228       cusparsestruct->nonzerorow=0;
1229       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1230 
1231       if (a->compressedrow.use) {
1232         m    = a->compressedrow.nrows;
1233         ii   = a->compressedrow.i;
1234         ridx = a->compressedrow.rindex;
1235       } else {
1236         /* Forcing compressed row on the GPU */
1237         int k=0;
1238         ierr = PetscMalloc1((cusparsestruct->nonzerorow+1), &ii);CHKERRQ(ierr);
1239         ierr = PetscMalloc1((cusparsestruct->nonzerorow), &ridx);CHKERRQ(ierr);
1240         ii[0]=0;
1241         for (int j = 0; j<m; j++) {
1242           if ((a->i[j+1]-a->i[j])>0) {
1243             ii[k]  = a->i[j];
1244             ridx[k]= j;
1245             k++;
1246           }
1247         }
1248         ii[cusparsestruct->nonzerorow] = a->nz;
1249         m = cusparsestruct->nonzerorow;
1250       }
1251 
1252       /* allocate space for the triangular factor information */
1253       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1254       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1255       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1256       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
1257 
1258       err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUSP(err);
1259       err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1260       err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUSP(err);
1261       err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUSP(err);
1262       stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSP(stat);
1263 
1264       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1265       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1266         /* set the matrix */
1267 	CsrMatrix *matrix= new CsrMatrix;
1268 	matrix->num_rows = m;
1269 	matrix->num_cols = A->cmap->n;
1270 	matrix->num_entries = a->nz;
1271 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1272 	matrix->row_offsets->assign(ii, ii + m+1);
1273 
1274 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1275 	matrix->column_indices->assign(a->j, a->j+a->nz);
1276 
1277 	matrix->values = new THRUSTARRAY(a->nz);
1278 	matrix->values->assign(a->a, a->a+a->nz);
1279 
1280         /* assign the pointer */
1281         matstruct->mat = matrix;
1282 
1283       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1284 #if CUDA_VERSION>=4020
1285 	CsrMatrix *matrix= new CsrMatrix;
1286 	matrix->num_rows = m;
1287 	matrix->num_cols = A->cmap->n;
1288 	matrix->num_entries = a->nz;
1289 	matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1290 	matrix->row_offsets->assign(ii, ii + m+1);
1291 
1292 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1293 	matrix->column_indices->assign(a->j, a->j+a->nz);
1294 
1295 	matrix->values = new THRUSTARRAY(a->nz);
1296 	matrix->values->assign(a->a, a->a+a->nz);
1297 
1298         cusparseHybMat_t hybMat;
1299         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1300         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1301           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1302         stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1303                                 matstruct->descr, matrix->values->data().get(),
1304                                 matrix->row_offsets->data().get(),
1305                                 matrix->column_indices->data().get(),
1306                                 hybMat, 0, partition);CHKERRCUSP(stat);
1307         /* assign the pointer */
1308         matstruct->mat = hybMat;
1309 
1310 	if (matrix) {
1311 	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1312 	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1313 	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1314 	  delete (CsrMatrix*)matrix;
1315 	}
1316 #endif
1317       }
1318 
1319       /* assign the compressed row indices */
1320       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1321       matstruct->cprowIndices->assign(ridx,ridx+m);
1322 
1323       /* assign the pointer */
1324       cusparsestruct->mat = matstruct;
1325 
1326       if (!a->compressedrow.use) {
1327         ierr = PetscFree(ii);CHKERRQ(ierr);
1328         ierr = PetscFree(ridx);CHKERRQ(ierr);
1329       }
1330       cusparsestruct->workVector = new THRUSTARRAY;
1331       cusparsestruct->workVector->resize(m);
1332     } catch(char *ex) {
1333       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1334     }
1335     ierr = WaitForGPU();CHKERRCUSP(ierr);
1336 
1337     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
1338 
1339     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1340   }
1341   PetscFunctionReturn(0);
1342 }
1343 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/* Create GPU (VECSEQCUSP) vectors compatible with this matrix: the right
   vector lives in the column space, the left vector in the row space.
   Either output pointer may be NULL if that vector is not wanted. */
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rowbs,colbs;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rowbs,&colbs);CHKERRQ(ierr);
  if (right) {
    /* sized by the column layout, block size taken from the column map */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,colbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* sized by the row layout, block size taken from the row map */
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rowbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1369 
/* Functor for thrust::for_each over a zip iterator: adds the first tuple
   component into the second in place (destination += source). Used to
   scatter-accumulate the compressed-row product into a full-length vector. */
struct VecCUSPPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1379 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/* y = A*x on the GPU via cusparse, dispatching on the configured storage
   format (CSR directly; ELL/HYB through the cusparse hybrid-matrix API). */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the host-to-device copy is
     performed in MatAssemblyEnd_SeqAIJCUSPARSE. */
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
    CsrMatrix *csr = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols, csr->num_entries,
                             matstruct->alpha, matstruct->descr,
                             csr->values->data().get(), csr->row_offsets->data().get(),
                             csr->column_indices->data().get(), xarray->data().get(),
                             matstruct->beta, yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB are both handled through cusparse's hybrid-matrix routines */
    cusparseHybMat_t hyb = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hyb,
                             xarray->data().get(), matstruct->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1420 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/* y = A^T*x. The transpose is stored as its own explicit GPU structure
   (built lazily on first use), so the multiply itself runs in
   non-transpose mode against that structure. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT     = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the host-to-device copy is
     performed in MatAssemblyEnd_SeqAIJCUSPARSE. */
  if (!matstructT) {
    /* first transpose multiply: build the explicit transpose on the GPU */
    ierr       = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
    CsrMatrix *csr = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr->num_rows, csr->num_cols, csr->num_entries,
                             matstructT->alpha, matstructT->descr,
                             csr->values->data().get(), csr->row_offsets->data().get(),
                             csr->column_indices->data().get(), xarray->data().get(),
                             matstructT->beta, yarray->data().get());CHKERRCUSP(stat);
  } else {
#if CUDA_VERSION>=4020
    cusparseHybMat_t hyb = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hyb,
                             xarray->data().get(), matstructT->beta,
                             yarray->data().get());CHKERRCUSP(stat);
#endif
  }
  ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUSP(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1467 
1468 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/* z = y + A*x. The product is computed over the compressed (nonzero) rows
   into workVector, then scattered-added into the full-length result through
   the compressed-row index list cprowIndices. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the host-to-device copy is
     performed in MatAssemblyEnd_SeqAIJCUSPARSE. */
  try {
    /* start from z = y, then accumulate the product on top of it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *csr = (CsrMatrix*)matstruct->mat;
      /* The row count handed to the multiply is the number of compressed
         rows in the matrix, which matches the length of workVector. */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               csr->num_rows, csr->num_cols, csr->num_entries,
                               matstruct->alpha, matstruct->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), xarray->data().get(),
                               matstruct->beta, cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hyb = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstruct->alpha, matstruct->descr, hyb,
                                 xarray->data().get(), matstruct->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the compressed product into z with a += through cprowIndices */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(),
                                                                  thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(),
                                                                  thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());
    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1528 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/* z = y + A^T*x. Uses the explicitly stored GPU transpose (built lazily on
   first use); the product over the compressed rows lands in workVector and
   is then scattered-added into the full-length result. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT     = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  CUSPARRAY                    *xarray,*yarray,*zarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* No MatSeqAIJCUSPARSECopyToGPU(A) here: the host-to-device copy is
     performed in MatAssemblyEnd_SeqAIJCUSPARSE. */
  if (!matstructT) {
    /* first transpose multiply: build the explicit transpose on the GPU */
    ierr       = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from z = y, then accumulate the product on top of it */
    ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *csr = (CsrMatrix*)matstructT->mat;
      /* The row count handed to the multiply is the number of compressed
         rows in the matrix, which matches the length of workVector. */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               csr->num_rows, csr->num_cols, csr->num_entries,
                               matstructT->alpha, matstructT->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), xarray->data().get(),
                               matstructT->beta, cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hyb = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 matstructT->alpha, matstructT->descr, hyb,
                                 xarray->data().get(), matstructT->beta,
                                 cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
      }
#endif
    }

    /* scatter the compressed product into z with a += through cprowIndices */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(),
                                                                  thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
                     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(),
                                                                  thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
                     VecCUSPPlusEquals());

    ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
    ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUSP(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1594 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/* Finish host-side AIJ assembly, mirror the matrix onto the GPU (only for
   unfactored matrices), and on a final assembly install the cusparse
   multiply kernels. A flush assembly leaves the function table untouched. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1613 
1614 /* --------------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1675 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/* Release the GPU data hanging off A->spptr, then run the host AIJ
   destructor. Factored matrices carry triangular-factor structures;
   unfactored ones carry the SpMV structures (only if ever uploaded). */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
    ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1693 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/* Type constructor for MATSEQAIJCUSPARSE: builds the host AIJ matrix, then
   attaches either a GPU SpMV container (unfactored) or a triangular-factor
   container (factored) to B->spptr, creates the cusparse handle, and
   installs the CUSPARSE operation table.
   Fix over the previous version: each spptr field is initialized exactly
   once (stream and handle were previously assigned twice), and the repeated
   casts are replaced by a typed local pointer. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusparsestruct = new Mat_SeqAIJCUSPARSE;
    B->spptr = cusparsestruct;
    cusparsestruct->mat          = 0;
    cusparsestruct->matTranspose = 0;
    cusparsestruct->workVector   = 0;
    cusparsestruct->format       = MAT_CUSPARSE_CSR;
    cusparsestruct->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparsestruct->handle       = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr = cusparseTriFactors;
    cusparseTriFactors->loTriFactorPtr          = 0;
    cusparseTriFactors->upTriFactorPtr          = 0;
    cusparseTriFactors->loTriFactorPtrTranspose = 0;
    cusparseTriFactors->upTriFactorPtrTranspose = 0;
    cusparseTriFactors->rpermIndices            = 0;
    cusparseTriFactors->cpermIndices            = 0;
    cusparseTriFactors->workVector              = 0;
    cusparseTriFactors->nnz                     = 0;
    stat = cusparseCreate(&handle);CHKERRCUSP(stat);
    cusparseTriFactors->handle                  = handle;
  }

  /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
     default cusparse tri solve. Note the difference with the implementation in
     MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1754 
1755 /*M
1756    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1757 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1759    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1760    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1761 
1762    Options Database Keys:
1763 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1764 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1766 
1767   Level: beginner
1768 
1769 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1770 M*/
1771 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
/* Free all GPU-resident SpMV data (the matrix, its explicit transpose, and
   the compressed-row work vector) plus the cusparse handle, then delete the
   container and zero the caller's pointer. Safe to call on a null pointer.
   Fix over the previous version: the PetscErrorCode returned by the two
   MultStruct destroy calls is now checked instead of silently discarded,
   and the handle test no longer assigns inside the if condition. */
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSP(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1792 
#undef __FUNCT__
#define __FUNCT__ "CsrMatrix_Destroy"
/* Deallocate the three device arrays of a CsrMatrix and the struct itself,
   leaving the caller's pointer nulled. A null input is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = 0;
  PetscFunctionReturn(0);
}
1807 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
/* Tear down one triangular-factor structure: its cusparse matrix descriptor,
   its solve-analysis info, and its CSR storage; then delete the struct and
   null the caller's pointer. A null input is a no-op. */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactor) PetscFunctionReturn(0);
  if ((*trifactor)->descr) {
    stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSP(stat);
  }
  if ((*trifactor)->solveInfo) {
    stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUSP(stat);
  }
  ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
  delete *trifactor;
  *trifactor = 0;
  PetscFunctionReturn(0);
}
1825 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/* Destroy one SpMV structure: the stored matrix (hybrid or CSR, depending on
   format), the cusparse descriptor, the compressed-row index array, and the
   device-resident alpha/beta scalars; then delete the struct and null the
   caller's pointer. A null input is a no-op.
   Fix over the previous version: the PetscErrorCode returned by
   CsrMatrix_Destroy() is now checked (CHKERRQ) instead of being discarded,
   matching the error-handling convention used throughout this file. */
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB are both stored as a cusparse hybrid matrix */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSP(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha and beta are device allocations (cudaMalloc'd in CopyToGPU) */
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUSP(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUSP(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1854 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/* Destroy the complete triangular-factor bundle: all four factor
   structures, the row/column permutation index arrays, the work vector,
   and the cusparse handle; then delete the container and null the caller's
   pointer. A null input is a no-op.
   Fix over the previous version: the PetscErrorCode returned by each
   Mat_SeqAIJCUSPARSETriFactorStruct_Destroy() call is now checked instead
   of silently discarded, and the handle test no longer assigns inside the
   if condition. */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSP(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1879 
1880