xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 16a2e217b25f471615c5ac9018fb31428bdf9a94)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* Text names for the MatCUSPARSEStorageFormat enum values, in enum order, for use
   with PetscOptionsEnum(); the trailing entries are the enum type name, the option
   prefix shared by the values, and the NULL terminator. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetStream"
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's
   Mat_SeqAIJCUSPARSE structure and installs it on the matrix's CUSPARSE
   handle, so subsequent CUSPARSE calls for this matrix run on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  cusp->stream = stream;
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(status);
  PetscFunctionReturn(0);
}
53 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetHandle"
/*
   MatCUSPARSESetHandle - stores the given CUSPARSE handle in the matrix,
   destroying any handle previously held, and switches the new handle to
   device pointer mode (scalar arguments are read from device memory).
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  /* release the handle we currently own before adopting the caller's */
  if (cusp->handle) {
    status = cusparseDestroy(cusp->handle);CHKERRCUDA(status);
  }
  cusp->handle = handle;
  status = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(status);
  PetscFunctionReturn(0);
}
69 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSEClearHandle"
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the CUSPARSE handle
   stored in the matrix; the caller retains ownership of the handle itself.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
80 
#undef __FUNCT__
#define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
/*
   MatFactorGetSolverPackage_seqaij_cusparse - reports MATSOLVERCUSPARSE as the
   solver package of a factor matrix; composed on factor matrices created in
   this file under the name "MatFactorGetSolverPackage_C".
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
89 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
102 
#undef __FUNCT__
#define __FUNCT__ "MatGetFactor_seqaijcusparse_cusparse"
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an empty SEQAIJCUSPARSE factor
   matrix B, sized like A, and installs the symbolic factorization routines that
   match the requested factor type. Unsupported factor types raise PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* factors get their storage from the factorization, not from preallocation */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
129 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices; records the requested
   GPU storage format in the Mat_SeqAIJCUSPARSE structure.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both supported ops store the same format field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* before CUDA 4.2 only CSR storage exists, so reject the other formats */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
153 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2 or later)

   Output Parameter:

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat"
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the implementation composed under "MatCUSPARSESetFormat_C"
     (MatCUSPARSESetFormat_SeqAIJCUSPARSE for this type); "Try" means a matrix
     type without that method is silently skipped rather than an error */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
182 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the options-database entries
   controlling the GPU storage format of a SEQAIJCUSPARSE matrix:

   -mat_cusparse_mult_storage_format : format used for MatMult (offered only
                                       when the matrix is not a factor)
   -mat_cusparse_storage_format      : format used for both MatMult and TriSolve
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* NOTE(review): PetscOptionsHead is conventionally paired with PetscOptionsTail
     inside an existing options loop, while PetscObjectOptionsBegin/PetscOptionsEnd
     open and close their own loop; using both here looks inconsistent -- confirm
     against other MatSetFromOptions_* implementations */
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
211 
#undef __FUNCT__
#define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU factorization. The
   symbolic phase is delegated to the host AIJ implementation; afterwards the
   numeric-phase function pointer is redirected to the CUSPARSE-aware routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(err);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
223 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU factorization: run the host
   AIJ symbolic phase, then point the numeric phase at the CUSPARSE variant.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(err);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
235 
#undef __FUNCT__
#define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC factorization: run the
   host AIJ symbolic phase, then point the numeric phase at the CUSPARSE variant.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(err);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
247 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky factorization:
   run the host AIJ symbolic phase, then point the numeric phase at the
   CUSPARSE variant.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(err);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
259 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds a GPU (CUSPARSE CSR) copy
   of the unit lower triangular factor of an ILU/LU factorization and runs the
   CUSPARSE triangular-solve analysis on it.

   The host factor is read from the Mat_SeqAIJ arrays a->i/a->j/a->a; the unit
   diagonal is not present there and is inserted explicitly (one 1.0 per row)
   while assembling the temporary host arrays AiLo/AjLo/AALo.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* rebuild only when the GPU copy is absent or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 contributes only its implicit unit diagonal; rows 1..n-1 contribute
         ai[i+1]-ai[i] stored entries plus one diagonal each, giving n + ai[n]-ai[1] */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers used as staging for the device copies below */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;          /* row 0 holds just the unit diagonal */
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        /* ... then append the unit diagonal entry */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* device-side CSR arrays; assign() copies host -> device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* release the pinned staging buffers; the factor now lives on the GPU */
      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust/CUSPARSE failures typically throw std::exception-derived
         types rather than char*, so this handler may not catch them -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
363 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds a GPU (CUSPARSE CSR) copy
   of the upper triangular factor of an ILU/LU factorization and runs the
   CUSPARSE triangular-solve analysis on it.

   The host factor is accessed through a->diag (adiag): the code walks rows
   from last to first, filling the staging arrays AiUp/AjUp/AAUp back-to-front
   via a decreasing offset. The arithmetic below implies adiag[] decreases with
   the row index and that row i's upper part lies between adiag[i+1]+1 and
   adiag[i] in a->j/a->a, with the diagonal entry at adiag[i] stored in a form
   that must be reciprocated here (NOTE(review): confirm against the factored
   Mat_SeqAIJ storage convention).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* rebuild only when the GPU copy is absent or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers used as staging for the device copies below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        /* v/vi point just past the diagonal marker of row i+1, i.e. at row i's upper part */
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];   /* reciprocal of the stored diagonal entry */
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* device-side CSR arrays; assign() copies host -> device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned staging buffers; the factor now lives on the GPU */
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust/CUSPARSE failures typically throw std::exception-derived
         types rather than char*, so this handler may not catch them -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
463 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - copies both ILU/LU triangular
   factors to the GPU (with solve analysis), allocates the work vector used by
   the triangular solves, and caches the row/column orderings as device index
   arrays when they are not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* length-n device scratch vector shared between the lower and upper solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  /* the triangular factors are now valid on both CPU and GPU */
  A->valid_GPU_matrix = PETSC_CUDA_BOTH;
  /*lower triangular indices */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    /* device copy of the row permutation; left unallocated for identity orderings */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    /* device copy of the column permutation; left unallocated for identity orderings */
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
504 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the GPU triangular factors for
   an ICC/Cholesky factorization. Only the upper triangle is assembled on the
   host (AiUp/AjUp/AAUp plus a separate value array AALo); the "lower" factor
   reuses the same upper-triangular sparsity on the GPU and is applied with
   CUSPARSE_OPERATION_TRANSPOSE, so no transposed structure is ever formed.

   Note that A->data is viewed both as Mat_SeqAIJ (for a->nz) and as
   Mat_SeqSBAIJ (for the i/j/a arrays); this relies on the two structs sharing
   the layout of those leading members (NOTE(review): confirm).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  /* rebuild only when the GPU copy is absent or stale relative to the CPU data */
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers used as staging for the device copies below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        /* v[nz] is the stored diagonal entry of row i; its reciprocal is used for
           both value arrays */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries: negated for the upper factor, and additionally
             scaled by the reciprocal diagonal for the (transposed) lower factor */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      /* device-side CSR arrays; assign() copies host -> device */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the lower factor is stored with the SAME
         upper-triangular structure and is solved with the TRANSPOSE operation
         below, so the fill mode here is deliberately UPPER */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* same sparsity as the upper factor, different values (AALo) */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* both factors are now valid on the GPU; free the pinned staging buffers */
      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      /* NOTE(review): thrust/CUSPARSE failures typically throw std::exception-derived
         types rather than char*, so this handler may not catch them -- confirm */
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
651 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - copies the ICC/Cholesky factors
   to the GPU, allocates the work vector used by the triangular solves, and
   caches the (symmetric) permutation as device index arrays when it is not
   the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* length-n device scratch vector shared between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* nnz of the full symmetric factor: off-diagonals counted twice plus n diagonals */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  /* ICC uses a single symmetric permutation, so the same indices serve as both
     the row and column permutation on the device */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
682 
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
/* Numeric LU factorization: performs the factorization on the host, selects
   the MatSolve variant appropriate for the ordering, then stages the
   triangular factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization itself runs on the CPU */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering admits a faster solve path with no permutations */
  ierr = ISIdentity(fact->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(fact->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
709 
#undef __FUNCT__
#define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
/* Numeric Cholesky factorization: factors on the host, chooses the MatSolve
   variant based on whether the ordering is the identity, then stages the
   factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* with the natural ordering no permutation is needed during the solve */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factor data to the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
736 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
/* Builds the transposes (CSC storage, which cusparse reads as the CSR of the
   transpose) of the lower and upper triangular factors and performs the
   cusparse solve analysis on them, enabling MatSolveTranspose to run entirely
   on the GPU.  Callers invoke this lazily, only when the transpose factors do
   not yet exist, so the existing *PtrTranspose slots are assumed NULL here. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;  /* created below; previously these were (dead-)initialized from A->spptr and immediately overwritten */
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* descriptor settings mirror the original factor, except the fill mode
     flips because we are transposing a triangular matrix */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* no additional transpose needed at solve time: the data is already transposed */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor; the
     factors are square, so the row/column counts carry over unchanged */
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the new factor (cusparseTriFactors aliases A->spptr) */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* descriptor settings mirror the original factor, with the fill mode flipped */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* no additional transpose needed at solve time: the data is already transposed */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the new factor (cusparseTriFactors aliases A->spptr) */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
869 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
/* Builds a device-resident transpose of A (CSR or ELL/HYB as configured) for
   use by MatMultTranspose.  Fix: the transpose of an m x n matrix is n x m,
   so the transposed structure must use A->cmap->n rows and A->rmap->n columns;
   the previous code left the dimensions un-swapped, which also undersized the
   row_offsets array (cusparse_csr2csc writes num_cols+1 offsets) whenever
   A->cmap->n > A->rmap->n. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;  /* created below; callers only invoke us when no transpose exists yet */
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the transpose information, matching the base matrix's index base */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-side scalars so cusparse can run in device pointer mode */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose swaps the dimensions of A */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose, i.e. the CSC representation of A */
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR (original orientation) */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose, with swapped dimensions) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert the transposed CSC (an A->cmap->n by A->rmap->n CSR) to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the transpose on the Mat */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
998 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
/* Solves A^T x = b on the GPU for a permuted (non-natural-ordering) factored
   matrix: apply the row permutation, solve with the transposed upper factor,
   then the transposed lower factor, and finally apply the column permutation.
   NOTE(review): assumes rpermIndices/cpermIndices are non-NULL — this routine
   is only installed when the ordering is not the identity; confirm against
   the factor-numeric routines that set B->ops->solvetranspose. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                          n = xx->map->n;
  PetscScalar                       *xarray, *barray;
  thrust::device_ptr<PetscScalar>   xGPU,bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; both pointers
     are created together, so checking either would suffice */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: for A^T = (LU)^T = U^T L^T the upper factor comes first */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1066 
#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A^T x = b on the GPU for a factored matrix in natural ordering:
   solve with the transposed upper factor, then the transposed lower factor;
   no row/column permutations are applied. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  PetscScalar                       *xarray, *barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* lazily build and analyze the transposed factors on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* solve with the transposed upper factor: work = U^-T b */
  stat = cusparse_solve(factors->handle, upT->solveOp,
                        upT->csrMat->num_rows, &ALPHA, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        barray, work->data().get());CHKERRCUDA(stat);

  /* solve with the transposed lower factor: x = L^-T work */
  stat = cusparse_solve(factors->handle, loT->solveOp,
                        loT->csrMat->num_rows, &ALPHA, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the vector arrays and wait for the asynchronous work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1116 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
/* Solves A x = b on the GPU for a permuted (non-natural-ordering) factored
   matrix: apply the row permutation, forward-solve with L, back-solve with U,
   then apply the column permutation.
   NOTE(review): assumes rpermIndices/cpermIndices are non-NULL — this routine
   is only installed when the ordering is not the identity; confirm against
   the factor-numeric routines that set B->ops->solve. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscScalar                       *xarray, *barray;
  thrust::device_ptr<PetscScalar>   xGPU,bGPU;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  VecType                           t;
  PetscBool                         flg;

  PetscFunctionBegin;
  /* both vectors must live on the GPU (VECSEQCUDA) for the device solve */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: tempGPU = L^-1 x (forward substitution) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: x = U^-1 tempGPU (back substitution) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and wait for the asynchronous work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1183 
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
/* Solves A x = b on the GPU for a factored matrix in natural ordering:
   forward substitution with L followed by back substitution with U; no
   permutations are applied. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  PetscScalar                       *xarray, *barray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* device pointers for the right-hand side and the solution */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  /* forward substitution: work = L^-1 b */
  stat = cusparse_solve(factors->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        barray, work->data().get());CHKERRCUDA(stat);

  /* back substitution: x = U^-1 work */
  stat = cusparse_solve(factors->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), xarray);CHKERRCUDA(stat);

  /* restore the vector arrays and wait for the asynchronous work to finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1225 
#undef __FUNCT__
#define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
/*
   Copies the AIJ (CSR) matrix data from the host to the GPU and builds the
   cusparse structures used by the MatMult kernels.

   Rows with no nonzeros are always squeezed out of the device copy; the
   original row indices of the kept rows are recorded in cprowIndices so the
   MatMultAdd kernels can scatter results back into full-length vectors.
   Depending on cusparsestruct->format the matrix is kept in CSR form or
   converted to an ELL/HYB representation (requires CUDA 4.2+).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{

  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    /* any stale device copy is discarded and rebuilt from scratch;
       cusparsestruct->mat is re-assigned below once the new copy is ready */
    Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
    try {
      cusparsestruct->nonzerorow=0;
      for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);

      if (a->compressedrow.use) {
        /* the host already stores the matrix in compressed-row form */
        m    = a->compressedrow.nrows;
        ii   = a->compressedrow.i;
        ridx = a->compressedrow.rindex;
      } else {
        /* Forcing compressed row on the GPU: build row offsets and original
           row indices that skip the empty rows */
        int k=0;
        ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
        ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
        ii[0]=0;
        for (int j = 0; j<m; j++) {
          if ((a->i[j+1]-a->i[j])>0) {
            ii[k]  = a->i[j];
            ridx[k]= j;
            k++;
          }
        }
        ii[cusparsestruct->nonzerorow] = a->nz;
        m = cusparsestruct->nonzerorow;
      }

      /* allocate the descriptor and the device-resident scalar constants */
      matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

      /* alpha/beta must live in device memory because the handle is switched
         to CUSPARSE_POINTER_MODE_DEVICE below */
      err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
      stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        /* assign the pointer */
        matstruct->mat = matrix;

      } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=4020
        /* a temporary device CSR copy feeds the csr2hyb conversion */
        CsrMatrix *matrix= new CsrMatrix;
        matrix->num_rows = m;
        matrix->num_cols = A->cmap->n;
        matrix->num_entries = a->nz;
        matrix->row_offsets = new THRUSTINTARRAY32(m+1);
        matrix->row_offsets->assign(ii, ii + m+1);

        matrix->column_indices = new THRUSTINTARRAY32(a->nz);
        matrix->column_indices->assign(a->j, a->j+a->nz);

        matrix->values = new THRUSTARRAY(a->nz);
        matrix->values->assign(a->a, a->a+a->nz);

        cusparseHybMat_t hybMat;
        stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
        cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
                                matstruct->descr, matrix->values->data().get(),
                                matrix->row_offsets->data().get(),
                                matrix->column_indices->data().get(),
                                hybMat, 0, partition);CHKERRCUDA(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        /* the CSR scratch copy is no longer needed once converted; use the
           common destructor rather than duplicating its delete sequence */
        ierr = CsrMatrix_Destroy(&matrix);CHKERRQ(ierr);
#endif
      }

      /* assign the compressed row indices */
      matstruct->cprowIndices = new THRUSTINTARRAY(m);
      matstruct->cprowIndices->assign(ridx,ridx+m);

      /* assign the pointer */
      cusparsestruct->mat = matstruct;

      /* ii/ridx were allocated here only when the host matrix was not already
         compressed; otherwise they alias a->compressedrow and must survive */
      if (!a->compressedrow.use) {
        ierr = PetscFree(ii);CHKERRQ(ierr);
        ierr = PetscFree(ridx);CHKERRQ(ierr);
      }
      cusparsestruct->workVector = new THRUSTARRAY;
      cusparsestruct->workVector->resize(m);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    ierr = WaitForGPU();CHKERRCUDA(ierr);

    A->valid_GPU_matrix = PETSC_CUDA_BOTH;

    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1359 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_SeqAIJCUSPARSE"
/*
   Creates CUDA vectors conforming to the matrix layouts: *right is compatible
   with the column map (an input of MatMult), *left with the row map (an
   output of MatMult).  Either output pointer may be NULL.
*/
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rbs,cbs;
  MPI_Comm       comm;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  comm = PetscObjectComm((PetscObject)mat);
  if (right) {
    ierr = VecCreate(comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix column layout instead of building a fresh one */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix row layout instead of building a fresh one */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1385 
/* Thrust functor for a zipped (source, destination) pair: accumulates the
   source element into the destination element in place.  Used to scatter-add
   the compressed-row SpMV result back into a full-length vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1395 
#undef __FUNCT__
#define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
/*
   yy = A*xx, computed on the GPU with a cusparse SpMV.

   The device copy of A is assumed current: MatAssemblyEnd_SeqAIJCUSPARSE
   performs the host-to-device copy after final assembly.  The alpha/beta
   scalars used by the SpMV live in device memory (the handle was switched to
   CUSPARSE_POINTER_MODE_DEVICE in MatSeqAIJCUSPARSECopyToGPU).

   NOTE(review): MatSeqAIJCUSPARSECopyToGPU stores the device matrix with
   zero rows compressed out, so mat->num_rows can be smaller than A->rmap->n;
   this routine writes the SpMV result straight into yarray without the
   scatter used by MatMultAdd_SeqAIJCUSPARSE -- confirm this path is only
   exercised when no rows were compressed away.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscScalar                  *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR storage: y = alpha*A*x + beta*y */
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB storage (only available with CUDA 4.2 or later) */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream allows asynchronous execution */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1436 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
/*
   yy = A^T*xx on the GPU.  Rather than asking cusparse for a transposed
   operation, an explicit device copy of A^T is built lazily on first use
   (MatSeqAIJCUSPARSEGenerateTransposeForMult) and multiplied with
   CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscScalar                  *xarray,*yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the device copy of the transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR storage of the transpose: y = alpha*A^T*x + beta*y */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB storage (only available with CUDA 4.2 or later) */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* only synchronize when no user-provided stream allows asynchronous execution */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1483 
1484 
#undef __FUNCT__
#define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A*xx on the GPU.

   The device matrix is stored with empty rows compressed out, so the SpMV
   result is first produced in the compressed-length workVector and then
   scatter-added into zz through the saved original row indices
   (matstruct->cprowIndices) with a thrust::for_each over a zip of
   (workVector element, permuted zz element).
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  PetscScalar                     *xarray,*zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  try {
    /* start from zz = yy; the product is accumulated on top of this below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* here we need to be careful to set the number of rows in the multiply to the
       number of compressed rows in the matrix ... which is equivalent to the
       size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      /* ELL/HYB storage; skip the SpMV entirely when every row is empty */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray, matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1544 
#undef __FUNCT__
#define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
/*
   zz = yy + A^T*xx on the GPU.

   Uses the lazily-built device copy of A^T (see MatMultTranspose_SeqAIJCUSPARSE)
   and the same compressed-row scheme as MatMultAdd_SeqAIJCUSPARSE: the SpMV
   result is produced in workVector and scatter-added into zz through the
   transpose struct's cprowIndices.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  thrust::device_ptr<PetscScalar> zptr;
  PetscScalar                     *xarray,*zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
  /* build the device copy of the transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy; the product is accumulated on top of this below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      /* ELL/HYB storage; skip the SpMV entirely when every row is empty */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1610 
#undef __FUNCT__
#define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
/*
   Finishes assembly on the host, mirrors the data to the GPU, and (for final
   assembly only) installs the CUSPARSE multiplication kernels.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their GPU state in the triangular-factor
     structures, so only unfactored matrices are copied here */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1629 
1630 /* --------------------------------------------------------------------------------*/
1631 #undef __FUNCT__
1632 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1633 /*@
1634    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1636    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1637    assembly performance the user should preallocate the matrix storage by setting
1638    the parameter nz (or the array nnz).  By setting these parameters accurately,
1639    performance during matrix assembly can be increased by more than a factor of 50.
1640 
1641    Collective on MPI_Comm
1642 
1643    Input Parameters:
1644 +  comm - MPI communicator, set to PETSC_COMM_SELF
1645 .  m - number of rows
1646 .  n - number of columns
1647 .  nz - number of nonzeros per row (same for all rows)
1648 -  nnz - array containing the number of nonzeros in the various rows
1649          (possibly different for each row) or NULL
1650 
1651    Output Parameter:
1652 .  A - the matrix
1653 
1654    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1656    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1657 
1658    Notes:
1659    If nnz is given then nz is ignored
1660 
1661    The AIJ format (also called the Yale sparse matrix format or
1662    compressed row storage), is fully compatible with standard Fortran 77
1663    storage.  That is, the stored row and column indices can begin at
1664    either one (as in Fortran) or zero.  See the users' manual for details.
1665 
1666    Specify the preallocated storage with either nz or nnz (not both).
1667    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1668    allocation.  For large problems you MUST preallocate memory or you
1669    will get TERRIBLE performance, see the users' manual chapter on matrices.
1670 
1671    By default, this format uses inodes (identical nodes) when possible, to
1672    improve numerical efficiency of matrix-vector products and solves. We
1673    search for consecutive rows with the same nonzero structure, thereby
1674    reusing matrix information to achieve increased efficiency.
1675 
1676    Level: intermediate
1677 
1678 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1679 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  Mat            B;

  PetscFunctionBegin;
  /* create, size, and type the matrix, then hand it back through *A */
  ierr = MatCreate(comm,&B);CHKERRQ(ierr);
  ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  *A   = B;
  PetscFunctionReturn(0);
}
1691 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
/*
   Releases the GPU-side data attached to A->spptr (either the SpMV structures
   or, for factored matrices, the triangular factors), then destroys the
   underlying SeqAIJ matrix.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrix: GPU state lives in the triangular-factor structures */
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) {
    /* unfactored matrix: only tear down a GPU copy that was ever allocated */
    ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1709 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
/*
   Constructor for the SEQAIJCUSPARSE matrix type: builds a SeqAIJ matrix,
   attaches the GPU-side data in B->spptr (either the SpMV structures or, for
   factored matrices, the triangular-factor structures), and installs the
   CUSPARSE implementations of the matrix operations.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *spptr = new Mat_SeqAIJCUSPARSE;
    spptr->mat          = 0;
    spptr->matTranspose = 0;
    spptr->workVector   = 0;
    spptr->format       = MAT_CUSPARSE_CSR;
    spptr->stream       = 0;
    spptr->handle       = 0;
    /* attach before cusparseCreate() so an error there cannot leak spptr */
    B->spptr = spptr;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *spptr = new Mat_SeqAIJCUSPARSETriFactors;
    spptr->loTriFactorPtr          = 0;
    spptr->upTriFactorPtr          = 0;
    spptr->loTriFactorPtrTranspose = 0;
    spptr->upTriFactorPtrTranspose = 0;
    spptr->rpermIndices            = 0;
    spptr->cpermIndices            = 0;
    spptr->workVector              = 0;
    spptr->handle                  = 0;
    spptr->nnz                     = 0;
    /* attach before cusparseCreate() so an error there cannot leak spptr */
    B->spptr = spptr;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    spptr->handle = handle;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* no GPU copy exists yet; MatAssemblyEnd triggers the first host-to-device copy */
  B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1765 
1766 /*M
1767    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1768 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1770    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1771    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1772 
1773    Options Database Keys:
1774 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1775 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1777 
1778   Level: beginner
1779 
1780 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1781 M*/
1782 
1783 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1784 
1785 
#undef __FUNCT__
#define __FUNCT__ "MatSolverPackageRegister_CUSPARSE"
/*
   Registers the CUSPARSE solver package: a single factory routine provides
   the LU, Cholesky, ILU, and ICC factorizations of SEQAIJCUSPARSE matrices.
*/
PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  PetscInt            i;

  PetscFunctionBegin;
  for (i = 0; i < (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0])); i++) {
    ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1799 
1800 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSE_Destroy"
/*
   Frees the GPU SpMV data (matrix, transpose, work vector) and the cusparse
   handle, then deletes the container and zeros the caller's pointer.
   Safe to call with *cusparsestruct == NULL.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* propagate errors from the sub-destroys instead of silently ignoring them */
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    if ((*cusparsestruct)->handle) {
      stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1821 
#undef __FUNCT__
#define __FUNCT__ "CsrMatrix_Destroy"
/*
   Deletes the three device arrays of a CsrMatrix and the struct itself, then
   zeros the caller's pointer.  A NULL *mat is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1836 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactorStruct_Destroy"
/*
   Releases one triangular factor: its cusparse descriptor, solve-analysis
   info, and CSR storage, then the struct itself; zeros the caller's pointer.
   A NULL *trifactor is a no-op.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1854 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSEMultStruct_Destroy"
/*
   Releases one SpMV structure: the stored matrix (HYB or CSR depending on
   format), the cusparse descriptor, the compressed-row index array, and the
   device-resident alpha/beta scalars; zeros the caller's pointer.
   A NULL *matstruct is a no-op.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        CsrMatrix *mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code instead of silently discarding it */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1883 
#undef __FUNCT__
#define __FUNCT__ "Mat_SeqAIJCUSPARSETriFactors_Destroy"
/*
   Releases all four triangular factors (lower/upper, plain/transposed), the
   permutation index arrays, the work vector, and the cusparse handle, then
   deletes the container and zeros the caller's pointer.
   A NULL *trifactors is a no-op.
*/
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* propagate errors from the sub-destroys instead of silently ignoring them */
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1908 
1909