xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision aa372e3fe8df1a25e86bd398820fb3017c3d0016)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 
6 #include "petscconf.h"
7 #include "../src/mat/impls/aij/seq/aij.h"          /*I "petscmat.h" I*/
8 #include <../src/mat/impls/sbaij/seq/sbaij.h>
9 #include "../src/vec/vec/impls/dvecimpl.h"
10 #include "petsc-private/vecimpl.h"
11 #undef VecType
12 #include "cusparsematimpl.h"
13 
14 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
15 
16 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
17 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
19 
20 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
21 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
23 
24 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
26 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
28 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat);
29 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
30 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
31 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
32 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
33 
34 #undef __FUNCT__
35 #define __FUNCT__ "MatFactorGetSolverPackage_seqaij_cusparse"
36 PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
37 {
38   PetscFunctionBegin;
39   *type = MATSOLVERCUSPARSE;
40   PetscFunctionReturn(0);
41 }
42 
43 /*MC
44   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
45   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
46   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
47   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
48   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
49   algorithms are not recommended. This class does NOT support direct solver operations.
50 
51   Consult CUSPARSE documentation for more information about the matrix storage formats
52   which correspond to the options database keys below.
53 
54    Options Database Keys:
55 .  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
56 
57   Level: beginner
58 
59 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
60 M*/
61 
62 #undef __FUNCT__
63 #define __FUNCT__ "MatGetFactor_seqaij_cusparse"
64 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaij_cusparse(Mat A,MatFactorType ftype,Mat *B)
65 {
66   PetscErrorCode ierr;
67   PetscInt       n = A->rmap->n;
68 
69   PetscFunctionBegin;
70   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
71   (*B)->factortype = ftype;
72   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
73   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
74 
75   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
76     ierr = MatSetBlockSizes(*B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
77     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
78     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
79   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
80     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
81     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
82   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
83 
84   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
85   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
86   PetscFunctionReturn(0);
87 }
88 
89 #undef __FUNCT__
90 #define __FUNCT__ "MatCUSPARSESetFormat_SeqAIJCUSPARSE"
91 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
92 {
93   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
94 
95   PetscFunctionBegin;
96   switch (op) {
97   case MAT_CUSPARSE_MULT:
98     cusparsestruct->format = format;
99     break;
100   case MAT_CUSPARSE_SOLVE:
101     cusparseMatSolveStorageFormat = format;
102     break;
103   case MAT_CUSPARSE_ALL:
104     cusparsestruct->format = format;
105     cusparseMatSolveStorageFormat = format;
106     break;
107   default:
108     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL are currently supported.",op);
109   }
110   PetscFunctionReturn(0);
111 }
112 
113 /*@
114    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
115    operation. Only the MatMult operation can use different GPU storage formats
116    for MPIAIJCUSPARSE matrices.
117    Not Collective
118 
119    Input Parameters:
120 +  A - Matrix of type SEQAIJCUSPARSE
121 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT, MAT_CUSPARSE_SOLVE, and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
122 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB)
123 
124    Output Parameter:
125 
126    Level: intermediate
127 
128 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
129 @*/
130 #undef __FUNCT__
131 #define __FUNCT__ "MatCUSPARSESetFormat"
132 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
133 {
134   PetscErrorCode ierr;
135 
136   PetscFunctionBegin;
137   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
138   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
139   PetscFunctionReturn(0);
140 }
141 
142 #undef __FUNCT__
143 #define __FUNCT__ "MatSetFromOptions_SeqAIJCUSPARSE"
144 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A)
145 {
146   PetscErrorCode           ierr;
147   MatCUSPARSEStorageFormat format;
148   PetscBool                flg;
149 
150   PetscFunctionBegin;
151   ierr = PetscOptionsHead("SeqAIJCUSPARSE options");CHKERRQ(ierr);
152   ierr = PetscObjectOptionsBegin((PetscObject)A);
153   if (A->factortype==MAT_FACTOR_NONE) {
154     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
155                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
156     if (flg) {
157       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
158     }
159   } else {
160     ierr = PetscOptionsEnum("-mat_cusparse_solve_storage_format","sets storage format of (seq)aijcusparse gpu matrices for TriSolve",
161                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
162     if (flg) {
163       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_SOLVE,format);CHKERRQ(ierr);
164     }
165   }
166   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
167                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
168   if (flg) {
169     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
170   }
171   ierr = PetscOptionsEnd();CHKERRQ(ierr);
172   PetscFunctionReturn(0);
173 
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatILUFactorSymbolic_SeqAIJCUSPARSE"
178 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
179 {
180   PetscErrorCode ierr;
181 
182   PetscFunctionBegin;
183   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
184   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
185   PetscFunctionReturn(0);
186 }
187 
188 #undef __FUNCT__
189 #define __FUNCT__ "MatLUFactorSymbolic_SeqAIJCUSPARSE"
190 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
191 {
192   PetscErrorCode ierr;
193 
194   PetscFunctionBegin;
195   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
196   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
197   PetscFunctionReturn(0);
198 }
199 
200 #undef __FUNCT__
201 #define __FUNCT__ "MatICCFactorSymbolic_SeqAIJCUSPARSE"
202 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
203 {
204   PetscErrorCode ierr;
205 
206   PetscFunctionBegin;
207   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
208   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
209   PetscFunctionReturn(0);
210 }
211 
212 #undef __FUNCT__
213 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqAIJCUSPARSE"
214 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
215 {
216   PetscErrorCode ierr;
217 
218   PetscFunctionBegin;
219   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
220   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
221   PetscFunctionReturn(0);
222 }
223 
224 #undef __FUNCT__
225 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILULowerTriMatrix"
226 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
227 {
228   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
229   PetscInt                          n = A->rmap->n;
230   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
231   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
232   cusparseStatus_t                  stat;
233   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
234   const MatScalar                   *aa = a->a,*v;
235   PetscInt                          *AiLo, *AjLo;
236   PetscScalar                       *AALo;
237   PetscInt                          i,nz, nzLower, offset, rowOffset;
238   PetscErrorCode                    ierr;
239 
240   PetscFunctionBegin;
241   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
242     try {
243       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
244       nzLower=n+ai[n]-ai[1];
245 
246       /* Allocate Space for the lower triangular matrix */
247       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
248       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUSP(ierr);
249       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUSP(ierr);
250 
251       /* Fill the lower triangular matrix */
252       AiLo[0]  = (PetscInt) 0;
253       AiLo[n]  = nzLower;
254       AjLo[0]  = (PetscInt) 0;
255       AALo[0]  = (MatScalar) 1.0;
256       v        = aa;
257       vi       = aj;
258       offset   = 1;
259       rowOffset= 1;
260       for (i=1; i<n; i++) {
261         nz = ai[i+1] - ai[i];
262         /* additional 1 for the term on the diagonal */
263         AiLo[i]    = rowOffset;
264         rowOffset += nz+1;
265 
266         ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
267         ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
268 
269         offset      += nz;
270         AjLo[offset] = (PetscInt) i;
271         AALo[offset] = (MatScalar) 1.0;
272         offset      += 1;
273 
274         v  += nz;
275         vi += nz;
276       }
277 
278       /* allocate space for the triangular factor information */
279       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
280 
281       /* Create the matrix description */
282       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
283       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
284       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
285       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSP(stat);
286       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
287 
288       /* Create the solve analysis information */
289       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
290 
291       /* set the operation */
292       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
293 
294       /* set the matrix */
295       loTriFactor->csrMat = new CsrMatrix;
296       loTriFactor->csrMat->num_rows = n;
297       loTriFactor->csrMat->num_cols = n;
298       loTriFactor->csrMat->num_entries = nzLower;
299 
300       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
301       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
302 
303       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
304       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
305 
306       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
307       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
308 
309       /* perform the solve analysis */
310       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
311                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
312                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
313                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
314 
315       /* assign the pointer. Is this really necessary? */
316       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
317 
318       ierr = cudaFreeHost(AiLo);CHKERRCUSP(ierr);
319       ierr = cudaFreeHost(AjLo);CHKERRCUSP(ierr);
320       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
321     } catch(char *ex) {
322       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
323     }
324   }
325   PetscFunctionReturn(0);
326 }
327 
328 #undef __FUNCT__
329 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildILUUpperTriMatrix"
330 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
331 {
332   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
333   PetscInt                          n = A->rmap->n;
334   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
335   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
336   cusparseStatus_t                  stat;
337   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
338   const MatScalar                   *aa = a->a,*v;
339   PetscInt                          *AiUp, *AjUp;
340   PetscScalar                       *AAUp;
341   PetscInt                          i,nz, nzUpper, offset;
342   PetscErrorCode                    ierr;
343 
344   PetscFunctionBegin;
345   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
346     try {
347       /* next, figure out the number of nonzeros in the upper triangular matrix. */
348       nzUpper = adiag[0]-adiag[n];
349 
350       /* Allocate Space for the upper triangular matrix */
351       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
352       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
353       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
354 
355       /* Fill the upper triangular matrix */
356       AiUp[0]=(PetscInt) 0;
357       AiUp[n]=nzUpper;
358       offset = nzUpper;
359       for (i=n-1; i>=0; i--) {
360         v  = aa + adiag[i+1] + 1;
361         vi = aj + adiag[i+1] + 1;
362 
363         /* number of elements NOT on the diagonal */
364         nz = adiag[i] - adiag[i+1]-1;
365 
366         /* decrement the offset */
367         offset -= (nz+1);
368 
369         /* first, set the diagonal elements */
370         AjUp[offset] = (PetscInt) i;
371         AAUp[offset] = 1./v[nz];
372         AiUp[i]      = AiUp[i+1] - (nz+1);
373 
374         ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
375         ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
376       }
377 
378       /* allocate space for the triangular factor information */
379       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
380 
381       /* Create the matrix description */
382       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
383       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
384       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
385       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
386       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
387 
388       /* Create the solve analysis information */
389       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
390 
391       /* set the operation */
392       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
393 
394       /* set the matrix */
395       upTriFactor->csrMat = new CsrMatrix;
396       upTriFactor->csrMat->num_rows = n;
397       upTriFactor->csrMat->num_cols = n;
398       upTriFactor->csrMat->num_entries = nzUpper;
399 
400       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
401       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
402 
403       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
404       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
405 
406       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
407       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
408 
409       /* perform the solve analysis */
410       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
411                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
412                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
413                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
414 
415       /* assign the pointer. Is this really necessary? */
416       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
417 
418       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
419       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
420       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
421     } catch(char *ex) {
422       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
423     }
424   }
425   PetscFunctionReturn(0);
426 }
427 
428 #undef __FUNCT__
429 #define __FUNCT__ "MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU"
430 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
431 {
432   PetscErrorCode               ierr;
433   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
434   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
435   IS                           isrow = a->row,iscol = a->icol;
436   PetscBool                    row_identity,col_identity;
437   const PetscInt               *r,*c;
438   PetscInt                     n = A->rmap->n;
439 
440   PetscFunctionBegin;
441   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
442   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
443 
444   cusparseTriFactors->workVector = new THRUSTARRAY;
445   cusparseTriFactors->workVector->resize(n);
446   cusparseTriFactors->nnz=a->nz;
447 
448   A->valid_GPU_matrix = PETSC_CUSP_BOTH;
449   /*lower triangular indices */
450   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
451   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
452   if (!row_identity) {
453     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
454     cusparseTriFactors->rpermIndices->assign(r, r+n);
455   }
456   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
457 
458   /*upper triangular indices */
459   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
460   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
461   if (!col_identity) {
462     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
463     cusparseTriFactors->cpermIndices->assign(c, c+n);
464   }
465   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
466   PetscFunctionReturn(0);
467 }
468 
469 #undef __FUNCT__
470 #define __FUNCT__ "MatSeqAIJCUSPARSEBuildICCTriMatrices"
471 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
472 {
473   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
474   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
475   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
476   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
477   cusparseStatus_t                  stat;
478   PetscErrorCode                    ierr;
479   PetscInt                          *AiUp, *AjUp;
480   PetscScalar                       *AAUp;
481   PetscScalar                       *AALo;
482   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
483   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
484   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
485   const MatScalar                   *aa = b->a,*v;
486 
487   PetscFunctionBegin;
488   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
489     try {
490       /* Allocate Space for the upper triangular matrix */
491       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUSP(ierr);
492       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUSP(ierr);
493       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
494       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUSP(ierr);
495 
496       /* Fill the upper triangular matrix */
497       AiUp[0]=(PetscInt) 0;
498       AiUp[n]=nzUpper;
499       offset = 0;
500       for (i=0; i<n; i++) {
501         /* set the pointers */
502         v  = aa + ai[i];
503         vj = aj + ai[i];
504         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
505 
506         /* first, set the diagonal elements */
507         AjUp[offset] = (PetscInt) i;
508         AAUp[offset] = 1.0/v[nz];
509         AiUp[i]      = offset;
510         AALo[offset] = 1.0/v[nz];
511 
512         offset+=1;
513         if (nz>0) {
514           ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
515           ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
516           for (j=offset; j<offset+nz; j++) {
517             AAUp[j] = -AAUp[j];
518             AALo[j] = AAUp[j]/v[nz];
519           }
520           offset+=nz;
521         }
522       }
523 
524       /* allocate space for the triangular factor information */
525       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
526 
527       /* Create the matrix description */
528       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSP(stat);
529       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
530       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
531       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
532       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSP(stat);
533 
534       /* Create the solve analysis information */
535       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUSP(stat);
536 
537       /* set the operation */
538       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
539 
540       /* set the matrix */
541       upTriFactor->csrMat = new CsrMatrix;
542       upTriFactor->csrMat->num_rows = A->rmap->n;
543       upTriFactor->csrMat->num_cols = A->cmap->n;
544       upTriFactor->csrMat->num_entries = a->nz;
545 
546       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
547       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
548 
549       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
550       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
551 
552       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
553       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
554 
555       /* perform the solve analysis */
556       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
557                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
558                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
559                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUSP(stat);
560 
561       /* assign the pointer. Is this really necessary? */
562       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
563 
564       /* allocate space for the triangular factor information */
565       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
566 
567       /* Create the matrix description */
568       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSP(stat);
569       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
570       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSP(stat);
571       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSP(stat);
572       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSP(stat);
573 
574       /* Create the solve analysis information */
575       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUSP(stat);
576 
577       /* set the operation */
578       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
579 
580       /* set the matrix */
581       loTriFactor->csrMat = new CsrMatrix;
582       loTriFactor->csrMat->num_rows = A->rmap->n;
583       loTriFactor->csrMat->num_cols = A->cmap->n;
584       loTriFactor->csrMat->num_entries = a->nz;
585 
586       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
587       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
588 
589       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
590       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
591 
592       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
593       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
594 
595       /* perform the solve analysis */
596       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
597                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
598                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
599                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUSP(stat);
600 
601       /* assign the pointer. Is this really necessary? */
602       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
603 
604       A->valid_GPU_matrix = PETSC_CUSP_BOTH;
605       ierr = cudaFreeHost(AiUp);CHKERRCUSP(ierr);
606       ierr = cudaFreeHost(AjUp);CHKERRCUSP(ierr);
607       ierr = cudaFreeHost(AAUp);CHKERRCUSP(ierr);
608       ierr = cudaFreeHost(AALo);CHKERRCUSP(ierr);
609     } catch(char *ex) {
610       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
611     }
612   }
613   PetscFunctionReturn(0);
614 }
615 
616 #undef __FUNCT__
617 #define __FUNCT__ "MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU"
618 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
619 {
620   PetscErrorCode               ierr;
621   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
622   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
623   IS                           ip = a->row;
624   const PetscInt               *rip;
625   PetscBool                    perm_identity;
626   PetscInt                     n = A->rmap->n;
627 
628   PetscFunctionBegin;
629   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
630   cusparseTriFactors->workVector = new THRUSTARRAY;
631   cusparseTriFactors->workVector->resize(n);
632   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
633 
634   /*lower triangular indices */
635   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
636   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
637   if (!perm_identity) {
638     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
639     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
640     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
641     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
642   }
643   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
644   PetscFunctionReturn(0);
645 }
646 
647 #undef __FUNCT__
648 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJCUSPARSE"
649 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
650 {
651   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
652   IS             isrow = b->row,iscol = b->col;
653   PetscBool      row_identity,col_identity;
654   PetscErrorCode ierr;
655 
656   PetscFunctionBegin;
657   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
658   /* determine which version of MatSolve needs to be used. */
659   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
660   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
661   if (row_identity && col_identity) {
662     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
663     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
664   } else {
665     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
666     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
667   }
668 
669   /* get the triangular factors */
670   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
671   PetscFunctionReturn(0);
672 }
673 
674 #undef __FUNCT__
675 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqAIJCUSPARSE"
676 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
677 {
678   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
679   IS             ip = b->row;
680   PetscBool      perm_identity;
681   PetscErrorCode ierr;
682 
683   PetscFunctionBegin;
684   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
685 
686   /* determine which version of MatSolve needs to be used. */
687   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
688   if (perm_identity) {
689     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
690     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
691   } else {
692     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
693     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
694   }
695 
696   /* get the triangular factors */
697   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
698   PetscFunctionReturn(0);
699 }
700 
701 #undef __FUNCT__
702 #define __FUNCT__ "MatSeqAIJCUSPARSEAnalyzeTransposeForSolve"
703 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
704 {
705   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
706   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
707   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
708   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
709   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
710   cusparseStatus_t                  stat;
711   cusparseIndexBase_t               indexBase;
712   cusparseMatrixType_t              matrixType;
713   cusparseFillMode_t                fillMode;
714   cusparseDiagType_t                diagType;
715 
716   PetscFunctionBegin;
717 
718   /*********************************************/
719   /* Now the Transpose of the Lower Tri Factor */
720   /*********************************************/
721 
722   /* allocate space for the transpose of the lower triangular factor */
723   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
724 
725   /* set the matrix descriptors of the lower triangular factor */
726   matrixType = cusparseGetMatType(loTriFactor->descr);
727   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
728   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
729     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
730   diagType = cusparseGetMatDiagType(loTriFactor->descr);
731 
732   /* Create the matrix description */
733   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSP(stat);
734   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSP(stat);
735   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSP(stat);
736   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSP(stat);
737   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSP(stat);
738 
739   /* Create the solve analysis information */
740   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUSP(stat);
741 
742   /* set the operation */
743   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
744 
745   /* allocate GPU space for the CSC of the lower triangular factor*/
746   loTriFactorT->csrMat = new CsrMatrix;
747   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
748   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
749   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
750   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
751   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
752   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
753 
754   /* compute the transpose of the lower triangular factor, i.e. the CSC */
755   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
756                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
757                           loTriFactor->csrMat->values->data().get(),
758                           loTriFactor->csrMat->row_offsets->data().get(),
759                           loTriFactor->csrMat->column_indices->data().get(),
760                           loTriFactorT->csrMat->values->data().get(),
761                           loTriFactorT->csrMat->column_indices->data().get(),
762                           loTriFactorT->csrMat->row_offsets->data().get(),
763                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
764 
765   /* perform the solve analysis on the transposed matrix */
766   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
767                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
768                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
769                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
770                            loTriFactorT->solveInfo);CHKERRCUSP(stat);
771 
772   /* assign the pointer. Is this really necessary? */
773   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
774 
775   /*********************************************/
776   /* Now the Transpose of the Upper Tri Factor */
777   /*********************************************/
778 
779   /* allocate space for the transpose of the upper triangular factor */
780   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
781 
782   /* set the matrix descriptors of the upper triangular factor */
783   matrixType = cusparseGetMatType(upTriFactor->descr);
784   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
785   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
786     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
787   diagType = cusparseGetMatDiagType(upTriFactor->descr);
788 
789   /* Create the matrix description */
790   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSP(stat);
791   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSP(stat);
792   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSP(stat);
793   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSP(stat);
794   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSP(stat);
795 
796   /* Create the solve analysis information */
797   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUSP(stat);
798 
799   /* set the operation */
800   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
801 
802   /* allocate GPU space for the CSC of the upper triangular factor*/
803   upTriFactorT->csrMat = new CsrMatrix;
804   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
805   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
806   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
807   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
808   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
809   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
810 
811   /* compute the transpose of the upper triangular factor, i.e. the CSC */
812   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
813                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
814                           upTriFactor->csrMat->values->data().get(),
815                           upTriFactor->csrMat->row_offsets->data().get(),
816                           upTriFactor->csrMat->column_indices->data().get(),
817                           upTriFactorT->csrMat->values->data().get(),
818                           upTriFactorT->csrMat->column_indices->data().get(),
819                           upTriFactorT->csrMat->row_offsets->data().get(),
820                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
821 
822   /* perform the solve analysis on the transposed matrix */
823   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
824                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
825                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
826                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
827                            upTriFactorT->solveInfo);CHKERRCUSP(stat);
828 
829   /* assign the pointer. Is this really necessary? */
830   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
831   PetscFunctionReturn(0);
832 }
833 
834 #undef __FUNCT__
835 #define __FUNCT__ "MatSeqAIJCUSPARSEGenerateTransposeForMult"
836 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
837 {
838   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
839   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
840   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
841   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
842   cusparseStatus_t             stat;
843   cusparseIndexBase_t          indexBase;
844 
845   PetscFunctionBegin;
846 
847   /* allocate space for the triangular factor information */
848   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
849   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSP(stat);
850   indexBase = cusparseGetMatIndexBase(matstruct->descr);
851   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSP(stat);
852   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
853 
854   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
855     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
856     CsrMatrix *matrixT= new CsrMatrix;
857     matrixT->num_rows = A->rmap->n;
858     matrixT->num_cols = A->cmap->n;
859     matrixT->num_entries = a->nz;
860     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
861     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
862     matrixT->values = new THRUSTARRAY(a->nz);
863 
864     /* compute the transpose of the upper triangular factor, i.e. the CSC */
865     indexBase = cusparseGetMatIndexBase(matstruct->descr);
866     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
867                             matrix->num_cols, matrix->num_entries,
868                             matrix->values->data().get(),
869                             matrix->row_offsets->data().get(),
870                             matrix->column_indices->data().get(),
871                             matrixT->values->data().get(),
872                             matrixT->column_indices->data().get(),
873                             matrixT->row_offsets->data().get(),
874                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
875 
876     /* assign the pointer */
877     matstructT->mat = matrixT;
878 
879   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
880 
881     /* First convert HYB to CSR */
882     CsrMatrix *temp= new CsrMatrix;
883     temp->num_rows = A->rmap->n;
884     temp->num_cols = A->cmap->n;
885     temp->num_entries = a->nz;
886     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
887     temp->column_indices = new THRUSTINTARRAY32(a->nz);
888     temp->values = new THRUSTARRAY(a->nz);
889 
890     stat = cusparse_hyb2csr(cusparsestruct->handle,
891                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
892                             temp->values->data().get(),
893                             temp->row_offsets->data().get(),
894                             temp->column_indices->data().get());CHKERRCUSP(stat);
895 
896     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
897     CsrMatrix *tempT= new CsrMatrix;
898     tempT->num_rows = A->rmap->n;
899     tempT->num_cols = A->cmap->n;
900     tempT->num_entries = a->nz;
901     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
902     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
903     tempT->values = new THRUSTARRAY(a->nz);
904 
905     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
906                             temp->num_cols, temp->num_entries,
907                             temp->values->data().get(),
908                             temp->row_offsets->data().get(),
909                             temp->column_indices->data().get(),
910                             tempT->values->data().get(),
911                             tempT->column_indices->data().get(),
912                             tempT->row_offsets->data().get(),
913                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSP(stat);
914 
915     /* Last, convert CSC to HYB */
916     cusparseHybMat_t hybMat;
917     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
918     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
919       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
920     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
921                             matstructT->descr, tempT->values->data().get(),
922                             tempT->row_offsets->data().get(),
923                             tempT->column_indices->data().get(),
924                             hybMat, 0, partition);CHKERRCUSP(stat);
925 
926     /* assign the pointer */
927     matstructT->mat = hybMat;
928 
929     /* delete temporaries */
930     if (tempT) {
931       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
932       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
933       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
934       delete (CsrMatrix*) tempT;
935     }
936     if (temp) {
937       if (temp->values) delete (THRUSTARRAY*) temp->values;
938       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
939       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
940       delete (CsrMatrix*) temp;
941     }
942   }
943   /* assign the compressed row indices */
944   matstructT->cprowIndices = new THRUSTINTARRAY;
945 
946   /* assign the pointer */
947   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
948   PetscFunctionReturn(0);
949 }
950 
951 #undef __FUNCT__
952 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE"
953 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
954 {
955   CUSPARRAY                         *xGPU, *bGPU;
956   cusparseStatus_t                  stat;
957   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
958   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
959   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
960   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
961   PetscErrorCode                    ierr;
962 
963   PetscFunctionBegin;
964   /* Analyze the matrix and create the transpose ... on the fly */
965   if (!loTriFactorT && !upTriFactorT) {
966     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
967     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
968     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
969   }
970 
971   /* Get the GPU pointers */
972   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
973   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
974 
975   /* First, reorder with the row permutation */
976   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
977                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
978                xGPU->begin());
979 
980   /* First, solve U */
981   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
982                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
983                         upTriFactorT->csrMat->values->data().get(),
984                         upTriFactorT->csrMat->row_offsets->data().get(),
985                         upTriFactorT->csrMat->column_indices->data().get(),
986                         upTriFactorT->solveInfo,
987                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
988 
989   /* Then, solve L */
990   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
991                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
992                         loTriFactorT->csrMat->values->data().get(),
993                         loTriFactorT->csrMat->row_offsets->data().get(),
994                         loTriFactorT->csrMat->column_indices->data().get(),
995                         loTriFactorT->solveInfo,
996                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
997 
998   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
999   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1000                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1001                tempGPU->begin());
1002 
1003   /* Copy the temporary to the full solution. */
1004   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
1005 
1006   /* restore */
1007   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1008   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1009   ierr = WaitForGPU();CHKERRCUSP(ierr);
1010 
1011   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1012   PetscFunctionReturn(0);
1013 }
1014 
1015 #undef __FUNCT__
1016 #define __FUNCT__ "MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering"
1017 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1018 {
1019   CUSPARRAY                         *xGPU,*bGPU;
1020   cusparseStatus_t                  stat;
1021   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1022   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1023   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1024   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1025   PetscErrorCode                    ierr;
1026 
1027   PetscFunctionBegin;
1028   /* Analyze the matrix and create the transpose ... on the fly */
1029   if (!loTriFactorT && !upTriFactorT) {
1030     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1031     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1032     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1033   }
1034 
1035   /* Get the GPU pointers */
1036   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1037   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
1038 
1039   /* First, solve U */
1040   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1041                         upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
1042                         upTriFactorT->csrMat->values->data().get(),
1043                         upTriFactorT->csrMat->row_offsets->data().get(),
1044                         upTriFactorT->csrMat->column_indices->data().get(),
1045                         upTriFactorT->solveInfo,
1046                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1047 
1048   /* Then, solve L */
1049   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1050                         loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
1051                         loTriFactorT->csrMat->values->data().get(),
1052                         loTriFactorT->csrMat->row_offsets->data().get(),
1053                         loTriFactorT->csrMat->column_indices->data().get(),
1054                         loTriFactorT->solveInfo,
1055                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1056 
1057   /* restore */
1058   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1059   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1060   ierr = WaitForGPU();CHKERRCUSP(ierr);
1061   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1062   PetscFunctionReturn(0);
1063 }
1064 
1065 #undef __FUNCT__
1066 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE"
1067 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1068 {
1069   CUSPARRAY                         *xGPU,*bGPU;
1070   cusparseStatus_t                  stat;
1071   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1072   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1073   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1074   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1075   PetscErrorCode                    ierr;
1076 
1077   PetscFunctionBegin;
1078   /* Get the GPU pointers */
1079   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1080   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
1081 
1082   /* First, reorder with the row permutation */
1083   thrust::copy(thrust::make_permutation_iterator(bGPU->begin(), cusparseTriFactors->rpermIndices->begin()),
1084                thrust::make_permutation_iterator(bGPU->end(), cusparseTriFactors->rpermIndices->end()),
1085                xGPU->begin());
1086 
1087   /* Next, solve L */
1088   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1089                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1090                         loTriFactor->csrMat->values->data().get(),
1091                         loTriFactor->csrMat->row_offsets->data().get(),
1092                         loTriFactor->csrMat->column_indices->data().get(),
1093                         loTriFactor->solveInfo,
1094                         xGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1095 
1096   /* Then, solve U */
1097   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1098                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1099                         upTriFactor->csrMat->values->data().get(),
1100                         upTriFactor->csrMat->row_offsets->data().get(),
1101                         upTriFactor->csrMat->column_indices->data().get(),
1102                         upTriFactor->solveInfo,
1103                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1104 
1105   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1106   thrust::copy(thrust::make_permutation_iterator(xGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
1107                thrust::make_permutation_iterator(xGPU->end(), cusparseTriFactors->cpermIndices->end()),
1108                tempGPU->begin());
1109 
1110   /* Copy the temporary to the full solution. */
1111   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU->begin());
1112 
1113   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1114   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1115   ierr = WaitForGPU();CHKERRCUSP(ierr);
1116   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1117   PetscFunctionReturn(0);
1118 }
1119 
1120 #undef __FUNCT__
1121 #define __FUNCT__ "MatSolve_SeqAIJCUSPARSE_NaturalOrdering"
1122 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1123 {
1124   CUSPARRAY                         *xGPU,*bGPU;
1125   cusparseStatus_t                  stat;
1126   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1127   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1128   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1129   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1130   PetscErrorCode                    ierr;
1131 
1132   PetscFunctionBegin;
1133   /* Get the GPU pointers */
1134   ierr = VecCUSPGetArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1135   ierr = VecCUSPGetArrayRead(bb,&bGPU);CHKERRQ(ierr);
1136 
1137   /* First, solve L */
1138   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1139                         loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
1140                         loTriFactor->csrMat->values->data().get(),
1141                         loTriFactor->csrMat->row_offsets->data().get(),
1142                         loTriFactor->csrMat->column_indices->data().get(),
1143                         loTriFactor->solveInfo,
1144                         bGPU->data().get(), tempGPU->data().get());CHKERRCUSP(stat);
1145 
1146   /* Next, solve U */
1147   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1148                         upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
1149                         upTriFactor->csrMat->values->data().get(),
1150                         upTriFactor->csrMat->row_offsets->data().get(),
1151                         upTriFactor->csrMat->column_indices->data().get(),
1152                         upTriFactor->solveInfo,
1153                         tempGPU->data().get(), xGPU->data().get());CHKERRCUSP(stat);
1154 
1155   ierr = VecCUSPRestoreArrayRead(bb,&bGPU);CHKERRQ(ierr);
1156   ierr = VecCUSPRestoreArrayWrite(xx,&xGPU);CHKERRQ(ierr);
1157   ierr = WaitForGPU();CHKERRCUSP(ierr);
1158   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1159   PetscFunctionReturn(0);
1160 }
1161 
1162 #undef __FUNCT__
1163 #define __FUNCT__ "MatSeqAIJCUSPARSECopyToGPU"
1164 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1165 {
1166 
1167   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1168   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1169   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1170   PetscInt                     m = A->rmap->n,*ii,*ridx;
1171   PetscErrorCode               ierr;
1172   cusparseStatus_t             stat;
1173 
1174   PetscFunctionBegin;
1175   if (A->valid_GPU_matrix == PETSC_CUSP_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUSP_CPU) {
1176     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1177     if (matstruct) {
1178       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Data structure should not be initialized.");
1179     }
1180     try {
1181       cusparsestruct->nonzerorow=0;
1182       for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1183 
1184       if (a->compressedrow.use) {
1185         m    = a->compressedrow.nrows;
1186         ii   = a->compressedrow.i;
1187         ridx = a->compressedrow.rindex;
1188       } else {
1189         /* Forcing compressed row on the GPU ... only relevant for CSR storage */
1190         int k=0;
1191         ierr = PetscMalloc((cusparsestruct->nonzerorow+1)*sizeof(PetscInt), &ii);CHKERRQ(ierr);
1192         ierr = PetscMalloc((cusparsestruct->nonzerorow)*sizeof(PetscInt), &ridx);CHKERRQ(ierr);
1193         ii[0]=0;
1194         for (int j = 0; j<m; j++) {
1195           if ((a->i[j+1]-a->i[j])>0) {
1196             ii[k]  = a->i[j];
1197             ridx[k]= j;
1198             k++;
1199           }
1200         }
1201         ii[cusparsestruct->nonzerorow] = a->nz;
1202 
1203         m = cusparsestruct->nonzerorow;
1204       }
1205 
1206       /* allocate space for the triangular factor information */
1207       matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1208       stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSP(stat);
1209       stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSP(stat);
1210       stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSP(stat);
1211 
1212       /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1213       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1214         /* set the matrix */
1215 	CsrMatrix *matrix= new CsrMatrix;
1216 	matrix->num_rows = A->rmap->n;
1217 	matrix->num_cols = A->cmap->n;
1218 	matrix->num_entries = a->nz;
1219 	matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1220 	matrix->row_offsets->assign(ii, ii + A->rmap->n+1);
1221 
1222 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1223 	matrix->column_indices->assign(a->j, a->j+a->nz);
1224 
1225 	matrix->values = new THRUSTARRAY(a->nz);
1226 	matrix->values->assign(a->a, a->a+a->nz);
1227 
1228         /* assign the pointer */
1229         matstruct->mat = matrix;
1230 
1231       } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1232 
1233 	CsrMatrix *matrix= new CsrMatrix;
1234 	matrix->num_rows = A->rmap->n;
1235 	matrix->num_cols = A->cmap->n;
1236 	matrix->num_entries = a->nz;
1237 	matrix->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1238 	matrix->row_offsets->assign(ii, ii + A->rmap->n+1);
1239 
1240 	matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1241 	matrix->column_indices->assign(a->j, a->j+a->nz);
1242 
1243 	matrix->values = new THRUSTARRAY(a->nz);
1244 	matrix->values->assign(a->a, a->a+a->nz);
1245 
1246         cusparseHybMat_t hybMat;
1247         stat = cusparseCreateHybMat(&hybMat);CHKERRCUSP(stat);
1248         cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1249           CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1250         stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1251                                 matstruct->descr, matrix->values->data().get(),
1252                                 matrix->row_offsets->data().get(),
1253                                 matrix->column_indices->data().get(),
1254                                 hybMat, 0, partition);CHKERRCUSP(stat);
1255         /* assign the pointer */
1256         matstruct->mat = hybMat;
1257 
1258 	if (matrix) {
1259 	  if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1260 	  if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1261 	  if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1262 	  delete (CsrMatrix*)matrix;
1263 	}
1264       }
1265 
1266       /* assign the compressed row indices */
1267       matstruct->cprowIndices = new THRUSTINTARRAY(m);
1268       matstruct->cprowIndices->assign(ridx,ridx+m);
1269 
1270       /* assign the pointer */
1271       cusparsestruct->mat = matstruct;
1272 
1273       if (!a->compressedrow.use) {
1274         ierr = PetscFree(ii);CHKERRQ(ierr);
1275         ierr = PetscFree(ridx);CHKERRQ(ierr);
1276       }
1277       cusparsestruct->workVector = new THRUSTARRAY;
1278       cusparsestruct->workVector->resize(m);
1279     } catch(char *ex) {
1280       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1281     }
1282     ierr = WaitForGPU();CHKERRCUSP(ierr);
1283 
1284     A->valid_GPU_matrix = PETSC_CUSP_BOTH;
1285 
1286     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1287   }
1288   PetscFunctionReturn(0);
1289 }
1290 
1291 #undef __FUNCT__
1292 #define __FUNCT__ "MatGetVecs_SeqAIJCUSPARSE"
1293 static PetscErrorCode MatGetVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
1294 {
1295   PetscErrorCode ierr;
1296 
1297   PetscFunctionBegin;
1298   if (right) {
1299     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
1300     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
1301     ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
1302     ierr = VecSetType(*right,VECSEQCUSP);CHKERRQ(ierr);
1303     ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
1304   }
1305   if (left) {
1306     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
1307     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
1308     ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
1309     ierr = VecSetType(*left,VECSEQCUSP);CHKERRQ(ierr);
1310     ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
1311   }
1312   PetscFunctionReturn(0);
1313 }
1314 
1315 struct VecCUSPPlusEquals
1316 {
1317   template <typename Tuple>
1318   __host__ __device__
1319   void operator()(Tuple t)
1320   {
1321     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1322   }
1323 };
1324 
1325 #undef __FUNCT__
1326 #define __FUNCT__ "MatMult_SeqAIJCUSPARSE"
1327 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1328 {
1329   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1330   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1331   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1332   CUSPARRAY                    *xarray,*yarray;
1333   PetscErrorCode               ierr;
1334   cusparseStatus_t             stat;
1335 
1336   PetscFunctionBegin;
1337   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1338      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1339   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1340   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1341   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1342     CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1343     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1344                              mat->num_rows, mat->num_cols, mat->num_entries,
1345                              &ALPHA, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
1346                              mat->column_indices->data().get(), xarray->data().get(), &BETA,
1347                              yarray->data().get());CHKERRCUSP(stat);
1348   } else {
1349     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1350     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1351                              &ALPHA, matstruct->descr, hybMat,
1352                              xarray->data().get(), &BETA,
1353                              yarray->data().get());CHKERRCUSP(stat);
1354   }
1355   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1356   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1357   if (!cusparsestruct->stream) {
1358     ierr = WaitForGPU();CHKERRCUSP(ierr);
1359   }
1360   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 #undef __FUNCT__
1365 #define __FUNCT__ "MatMultTranspose_SeqAIJCUSPARSE"
1366 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1367 {
1368   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1369   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1370   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1371   CUSPARRAY                    *xarray,*yarray;
1372   PetscErrorCode               ierr;
1373   cusparseStatus_t             stat;
1374 
1375   PetscFunctionBegin;
1376   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1377      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1378   if (!matstructT) {
1379     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1380     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1381   }
1382   ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1383   ierr = VecCUSPGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1384 
1385   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1386     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1387     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1388                              mat->num_rows, mat->num_cols,
1389                              mat->num_entries, &ALPHA, matstructT->descr,
1390                              mat->values->data().get(), mat->row_offsets->data().get(),
1391                              mat->column_indices->data().get(), xarray->data().get(), &BETA,
1392                              yarray->data().get());CHKERRCUSP(stat);
1393   } else {
1394     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1395     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1396                              &ALPHA, matstructT->descr, hybMat,
1397                              xarray->data().get(), &BETA,
1398                              yarray->data().get());CHKERRCUSP(stat);
1399   }
1400   ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1401   ierr = VecCUSPRestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1402   if (!cusparsestruct->stream) {
1403     ierr = WaitForGPU();CHKERRCUSP(ierr);
1404   }
1405   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1406   PetscFunctionReturn(0);
1407 }
1408 
1409 
1410 #undef __FUNCT__
1411 #define __FUNCT__ "MatMultAdd_SeqAIJCUSPARSE"
1412 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1413 {
1414   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1415   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1416   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1417   CUSPARRAY                    *xarray,*yarray,*zarray;
1418   PetscErrorCode               ierr;
1419   cusparseStatus_t             stat;
1420 
1421   PetscFunctionBegin;
1422   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1423      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1424   try {
1425     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
1426     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1427     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
1428     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1429 
1430     /* multiply add */
1431     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1432       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1433       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1434                                mat->num_rows, mat->num_cols,
1435                                mat->num_entries, &ALPHA, matstruct->descr,
1436                                mat->values->data().get(), mat->row_offsets->data().get(),
1437                                mat->column_indices->data().get(), xarray->data().get(), &BETA,
1438                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1439     } else {
1440       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1441       stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1442 			       &ALPHA, matstruct->descr, hybMat,
1443 			       xarray->data().get(), &BETA,
1444 			       cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1445     }
1446 
1447     /* scatter the data from the temporary into the full vector with a += operation */
1448     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))),
1449 		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1450 		     VecCUSPPlusEquals());
1451 
1452     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1453     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
1454     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1455 
1456   } catch(char *ex) {
1457     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1458   }
1459   ierr = WaitForGPU();CHKERRCUSP(ierr);
1460   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1461   PetscFunctionReturn(0);
1462 }
1463 
1464 #undef __FUNCT__
1465 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJCUSPARSE"
1466 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1467 {
1468   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1469   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1470   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1471   CUSPARRAY                    *xarray,*yarray,*zarray;
1472   PetscErrorCode               ierr;
1473   cusparseStatus_t             stat;
1474 
1475   PetscFunctionBegin;
1476   /* The line below should not be necessary as it has been moved to MatAssemblyEnd_SeqAIJCUSPARSE
1477      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); */
1478   if (!matstructT) {
1479     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1480     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1481   }
1482 
1483   try {
1484     ierr = VecCopy_SeqCUSP(yy,zz);CHKERRQ(ierr);
1485     ierr = VecCUSPGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1486     ierr = VecCUSPGetArrayRead(yy,&yarray);CHKERRQ(ierr);
1487     ierr = VecCUSPGetArrayWrite(zz,&zarray);CHKERRQ(ierr);
1488 
1489     /* multiply add with matrix transpose */
1490     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1491       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1492       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1493                                mat->num_rows, mat->num_cols, mat->num_entries, &ALPHA, matstructT->descr,
1494                                mat->values->data().get(), mat->row_offsets->data().get(),
1495                                mat->column_indices->data().get(), xarray->data().get(), &BETA,
1496                                cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1497     } else {
1498       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1499       stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1500 			       &ALPHA, matstructT->descr, hybMat,
1501 			       xarray->data().get(), &BETA,
1502 			       cusparsestruct->workVector->data().get());CHKERRCUSP(stat);
1503     }
1504 
1505     /* scatter the data from the temporary into the full vector with a += operation */
1506     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))),
1507 		     thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zarray->begin(), matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1508 		     VecCUSPPlusEquals());
1509 
1510     ierr = VecCUSPRestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1511     ierr = VecCUSPRestoreArrayRead(yy,&yarray);CHKERRQ(ierr);
1512     ierr = VecCUSPRestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);
1513 
1514   } catch(char *ex) {
1515     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1516   }
1517   ierr = WaitForGPU();CHKERRCUSP(ierr);
1518   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1519   PetscFunctionReturn(0);
1520 }
1521 
1522 #undef __FUNCT__
1523 #define __FUNCT__ "MatAssemblyEnd_SeqAIJCUSPARSE"
1524 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1525 {
1526   PetscErrorCode ierr;
1527 
1528   PetscFunctionBegin;
1529   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1530   if (A->factortype==MAT_FACTOR_NONE) {
1531     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1532   }
1533   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1534   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1535   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1536   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1537   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1538   PetscFunctionReturn(0);
1539 }
1540 
1541 /* --------------------------------------------------------------------------------*/
1542 #undef __FUNCT__
1543 #define __FUNCT__ "MatCreateSeqAIJCUSPARSE"
1544 /*@
1545    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1546    (the default parallel PETSc format). This matrix will ultimately pushed down
1547    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1548    assembly performance the user should preallocate the matrix storage by setting
1549    the parameter nz (or the array nnz).  By setting these parameters accurately,
1550    performance during matrix assembly can be increased by more than a factor of 50.
1551 
1552    Collective on MPI_Comm
1553 
1554    Input Parameters:
1555 +  comm - MPI communicator, set to PETSC_COMM_SELF
1556 .  m - number of rows
1557 .  n - number of columns
1558 .  nz - number of nonzeros per row (same for all rows)
1559 -  nnz - array containing the number of nonzeros in the various rows
1560          (possibly different for each row) or NULL
1561 
1562    Output Parameter:
1563 .  A - the matrix
1564 
1565    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1566    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1567    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1568 
1569    Notes:
1570    If nnz is given then nz is ignored
1571 
1572    The AIJ format (also called the Yale sparse matrix format or
1573    compressed row storage), is fully compatible with standard Fortran 77
1574    storage.  That is, the stored row and column indices can begin at
1575    either one (as in Fortran) or zero.  See the users' manual for details.
1576 
1577    Specify the preallocated storage with either nz or nnz (not both).
1578    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1579    allocation.  For large problems you MUST preallocate memory or you
1580    will get TERRIBLE performance, see the users' manual chapter on matrices.
1581 
1582    By default, this format uses inodes (identical nodes) when possible, to
1583    improve numerical efficiency of matrix-vector products and solves. We
1584    search for consecutive rows with the same nonzero structure, thereby
1585    reusing matrix information to achieve increased efficiency.
1586 
1587    Level: intermediate
1588 
1589 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1590 @*/
1591 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1592 {
1593   PetscErrorCode ierr;
1594 
1595   PetscFunctionBegin;
1596   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1597   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1598   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1599   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1600   PetscFunctionReturn(0);
1601 }
1602 
1603 #undef __FUNCT__
1604 #define __FUNCT__ "MatDestroy_SeqAIJCUSPARSE"
1605 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1606 {
1607   PetscErrorCode     ierr;
1608   cusparseStatus_t   stat;
1609 
1610   PetscFunctionBegin;
1611   if (A->factortype==MAT_FACTOR_NONE) {
1612     try {
1613       if (A->valid_GPU_matrix != PETSC_CUSP_UNALLOCATED) {
1614         Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1615         Mat_SeqAIJCUSPARSEMultStruct *matstruct  = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1616         Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1617 
1618         /* delete all the members in each of the data structures */
1619         if (matstruct) {
1620           if (matstruct->descr) { stat = cusparseDestroyMatDescr(matstruct->descr);CHKERRCUSP(stat); }
1621           if (matstruct->mat) {
1622             if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1623               cusparseHybMat_t hybMat = (cusparseHybMat_t) matstruct->mat;
1624               stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
1625             } else {
1626 	      CsrMatrix* mat = (CsrMatrix*)matstruct->mat;
1627 	      if (mat->values) delete (THRUSTARRAY*)mat->values;
1628 	      if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1629 	      if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1630               delete (CsrMatrix*)matstruct->mat;
1631             }
1632           }
1633           if (matstruct->cprowIndices) delete (THRUSTINTARRAY*)matstruct->cprowIndices;
1634           if (matstruct) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstruct;
1635         }
1636         if (matstructT) {
1637           if (matstructT->descr) { stat = cusparseDestroyMatDescr(matstructT->descr);CHKERRCUSP(stat); }
1638           if (matstructT->mat) {
1639             if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1640               cusparseHybMat_t hybMat = (cusparseHybMat_t) matstructT->mat;
1641               stat = cusparseDestroyHybMat(hybMat);CHKERRCUSP(stat);
1642             } else {
1643 	      CsrMatrix* matT = (CsrMatrix*)matstructT->mat;
1644 	      if (matT->values) delete (THRUSTARRAY*)matT->values;
1645 	      if (matT->column_indices) delete (THRUSTINTARRAY32*)matT->column_indices;
1646 	      if (matT->row_offsets) delete (THRUSTINTARRAY32*)matT->row_offsets;
1647               delete (CsrMatrix*)matstructT->mat;
1648             }
1649           }
1650           if (matstructT->cprowIndices) delete (THRUSTINTARRAY*)matstructT->cprowIndices;
1651           if (matstructT) delete (Mat_SeqAIJCUSPARSEMultStruct*)matstructT;
1652         }
1653         if (cusparsestruct->workVector) delete (THRUSTARRAY*)cusparsestruct->workVector;
1654         if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
1655         if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
1656         A->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1657       } else {
1658         Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1659         if (cusparsestruct->handle) { stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSP(stat); }
1660         if (cusparsestruct) delete (Mat_SeqAIJCUSPARSE*)cusparsestruct;
1661       }
1662     } catch(char *ex) {
1663       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1664     }
1665   } else {
1666     /* The triangular factors */
1667     try {
1668       Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1669       Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1670       Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1671       Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1672       Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1673 
1674       /* delete all the members in each of the data structures */
1675       if (loTriFactor) {
1676         if (loTriFactor->descr) { stat = cusparseDestroyMatDescr(loTriFactor->descr);CHKERRCUSP(stat); }
1677         if (loTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactor->solveInfo);CHKERRCUSP(stat); }
1678         if (loTriFactor->csrMat) {
1679 	  if (loTriFactor->csrMat->values) delete (THRUSTARRAY*)loTriFactor->csrMat->values;
1680 	  if (loTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->column_indices;
1681 	  if (loTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactor->csrMat->row_offsets;
1682 	  delete (CsrMatrix*)loTriFactor->csrMat;
1683 	}
1684         if (loTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactor;
1685       }
1686 
1687       if (upTriFactor) {
1688         if (upTriFactor->descr) { stat = cusparseDestroyMatDescr(upTriFactor->descr);CHKERRCUSP(stat); }
1689         if (upTriFactor->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactor->solveInfo);CHKERRCUSP(stat); }
1690         if (upTriFactor->csrMat) {
1691 	  if (upTriFactor->csrMat->values) delete (THRUSTARRAY*)upTriFactor->csrMat->values;
1692 	  if (upTriFactor->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->column_indices;
1693 	  if (upTriFactor->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactor->csrMat->row_offsets;
1694 	  delete (CsrMatrix*)upTriFactor->csrMat;
1695 	}
1696         if (upTriFactor) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactor;
1697       }
1698 
1699       if (loTriFactorT) {
1700         if (loTriFactorT->descr) { stat = cusparseDestroyMatDescr(loTriFactorT->descr);CHKERRCUSP(stat); }
1701         if (loTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(loTriFactorT->solveInfo);CHKERRCUSP(stat); }
1702         if (loTriFactorT->csrMat) {
1703 	  if (loTriFactorT->csrMat->values) delete (THRUSTARRAY*)loTriFactorT->csrMat->values;
1704 	  if (loTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->column_indices;
1705 	  if (loTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)loTriFactorT->csrMat->row_offsets;
1706 	  delete (CsrMatrix*)loTriFactorT->csrMat;
1707 	}
1708         if (loTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)loTriFactorT;
1709       }
1710 
1711       if (upTriFactorT) {
1712         if (upTriFactorT->descr) { stat = cusparseDestroyMatDescr(upTriFactorT->descr);CHKERRCUSP(stat); }
1713         if (upTriFactorT->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(upTriFactorT->solveInfo);CHKERRCUSP(stat); }
1714         if (upTriFactorT->csrMat) {
1715 	  if (upTriFactorT->csrMat->values) delete (THRUSTARRAY*)upTriFactorT->csrMat->values;
1716 	  if (upTriFactorT->csrMat->column_indices) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->column_indices;
1717 	  if (upTriFactorT->csrMat->row_offsets) delete (THRUSTINTARRAY32*)upTriFactorT->csrMat->row_offsets;
1718 	  delete (CsrMatrix*)upTriFactorT->csrMat;
1719 	}
1720         if (upTriFactorT) delete (Mat_SeqAIJCUSPARSETriFactorStruct*)upTriFactorT;
1721       }
1722       if (cusparseTriFactors->rpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->rpermIndices;
1723       if (cusparseTriFactors->cpermIndices) delete (THRUSTINTARRAY*)cusparseTriFactors->cpermIndices;
1724       if (cusparseTriFactors->workVector) delete (THRUSTARRAY*)cusparseTriFactors->workVector;
1725       if (cusparseTriFactors->handle) { stat = cusparseDestroy(cusparseTriFactors->handle);CHKERRCUSP(stat); }
1726       if (cusparseTriFactors) delete (Mat_SeqAIJCUSPARSETriFactors*)cusparseTriFactors;
1727     } catch(char *ex) {
1728       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1729     }
1730   }
1731   /*this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
1732   A->spptr = 0;
1733 
1734   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1735   PetscFunctionReturn(0);
1736 }
1737 
1738 #undef __FUNCT__
1739 #define __FUNCT__ "MatCreate_SeqAIJCUSPARSE"
1740 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1741 {
1742   PetscErrorCode ierr;
1743   cusparseStatus_t stat;
1744   cusparseHandle_t handle=0;
1745 
1746   PetscFunctionBegin;
1747   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
1748   if (B->factortype==MAT_FACTOR_NONE) {
1749     /* you cannot check the inode.use flag here since the matrix was just created.
1750        now build a GPU matrix data structure */
1751     B->spptr = new Mat_SeqAIJCUSPARSE;
1752     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1753     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1754     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1755     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1756     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1757     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1758     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1759     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1760     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1761   } else {
1762     /* NEXT, set the pointers to the triangular factors */
1763     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1764     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1765     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1766     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1767     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1768     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1769     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1770     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1771     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->format                  = cusparseMatSolveStorageFormat;
1772     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1773     stat = cusparseCreate(&handle);CHKERRCUSP(stat);
1774     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1775     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1776   }
1777 
1778   /* Here we overload MatGetFactor_petsc_C which enables -mat_type aijcusparse to use the
1779      default cusparse tri solve. Note the difference with the implementation in
1780      MatCreate_SeqAIJCUSP in ../seqcusp/aijcusp.cu */
1781   ierr = PetscObjectComposeFunction((PetscObject)B,"MatGetFactor_petsc_C",MatGetFactor_seqaij_cusparse);CHKERRQ(ierr);
1782 
1783   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1784   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1785   B->ops->getvecs          = MatGetVecs_SeqAIJCUSPARSE;
1786   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1787   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1788   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1789   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1790   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1791 
1792   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1793 
1794   B->valid_GPU_matrix = PETSC_CUSP_UNALLOCATED;
1795 
1796   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1797   PetscFunctionReturn(0);
1798 }
1799 
1800 /*M
1801    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1802 
1803    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1804    CSR, ELL, or Hybrid format. All matrix calculations are performed on Nvidia GPUs using
1805    the CUSPARSE library.
1806 
1807    Options Database Keys:
1808 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1809 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1810 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1811 -  -mat_cusparse_solve_storage_format csr - sets the storage format matrices (for factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1812 
1813   Level: beginner
1814 
1815 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1816 M*/
1817