xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2f6eced2a19e978d64f27de66fbc6c26cc5c7934)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/*
   MatCUSPARSESetStream - records a CUDA stream in the matrix GPU data and
   directs the stored cuSPARSE handle to issue its subsequent work on it.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (A->spptr holds a Mat_SeqAIJCUSPARSE)
-  stream - the CUDA stream on which cuSPARSE calls for A will execute
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  cusp->stream = stream;
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUDA(status);
  PetscFunctionReturn(0);
}
51 
/*
   MatCUSPARSESetHandle - takes ownership of the given cuSPARSE handle for this
   matrix, destroying any previously owned handle, and configures the handle
   for device pointer mode (scalar arguments to cuSPARSE live on the GPU).

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (A->spptr holds a Mat_SeqAIJCUSPARSE)
-  handle - the cuSPARSE handle the matrix should use from now on

   Bug fix: the old code unconditionally destroyed the stored handle before
   assigning the new one, so passing the handle that was already stored
   destroyed it and then called cusparseSetPointerMode() on a dead handle.
   The inequality guard makes re-setting the same handle a safe no-op.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle we previously owned before adopting the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
65 
/*
   MatCUSPARSEClearHandle - drops the matrix's reference to its cuSPARSE
   handle without destroying it (presumably the handle is owned by whoever
   installed it via MatCUSPARSESetHandle — confirm with callers).
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
74 
/*
   MatFactorGetSolverPackage_seqaij_cusparse - query callback reporting that a
   factored matrix of this type was produced by the MATSOLVERCUSPARSE package.
   Composed on factor matrices under the name "MatFactorGetSolverPackage_C".
*/
PetscErrorCode MatFactorGetSolverPackage_seqaij_cusparse(Mat A,const MatSolverPackage *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
81 
82 /*MC
83   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
 84   on a single GPU, of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
 85   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
86   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
87   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
88   algorithms are not recommended. This class does NOT support direct solver operations.
89 
90   Level: beginner
91 
92 .seealso: PCFactorSetMatSolverPackage(), MatSolverPackage, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
93 M*/
94 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an empty SEQAIJCUSPARSE
   matrix that will hold a factorization of A, and installs the symbolic
   factorization routines appropriate to the requested factor type.

   Input Parameters:
+  A     - the matrix to be factored (assumed square: sizes taken from A->rmap)
-  ftype - LU/ILU/ILUDT or Cholesky/ICC; anything else is rejected

   Output Parameter:
.  B     - the (unallocated) factor matrix
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* the factorization routines allocate the actual storage themselves */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverPackage_C",MatFactorGetSolverPackage_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
119 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat(). Sequential matrices keep a single storage format,
   so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are equivalent here.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusparsestruct->format = format;
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
#else
  /* before CUDA 4.2 only CSR storage exists, so non-CSR requests must fail */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
141 
142 /*@
143    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
144    operation. Only the MatMult operation can use different GPU storage formats
145    for MPIAIJCUSPARSE matrices.
146    Not Collective
147 
148    Input Parameters:
149 +  A - Matrix of type SEQAIJCUSPARSE
150 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
151 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
152 
153    Output Parameter:
154 
155    Level: intermediate
156 
157 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific setter if one has been composed on A
     (no-op for matrix types that do not implement it) */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
168 
/* Processes the -mat_cusparse_mult_storage_format and -mat_cusparse_storage_format
   runtime options that choose the GPU storage format used by SEQAIJCUSPARSE
   matrices for SpMV and (for the second option) triangular solves. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* NOTE(review): the return value of PetscObjectOptionsBegin() is assigned but not
     CHKERRQ-checked, and the PetscOptionsHead() above has no matching PetscOptionsTail();
     confirm this against the options-macro pairing conventions used elsewhere in PETSc. */
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option is offered only for unfactored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  /* this format applies to every operation (MAT_CUSPARSE_ALL) */
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);

}
195 
/* Symbolic ILU: delegate to the CPU SeqAIJ implementation, then override the
   numeric-phase function pointer with the CUSPARSE variant.  The override must
   come after the call, since MatILUFactorSymbolic_SeqAIJ presumably installs
   the CPU numeric routine itself — TODO confirm against aijfact.c. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
205 
/* Symbolic full LU: delegate to the CPU SeqAIJ implementation, then override
   the numeric-phase pointer with the CUSPARSE variant (after the call, so the
   GPU routine wins over whatever the CPU symbolic phase installed). */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
215 
/* Symbolic ICC: delegate to the CPU SeqAIJ implementation, then override the
   Cholesky numeric-phase pointer with the CUSPARSE variant (after the call,
   so the GPU routine wins over whatever the CPU symbolic phase installed). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
225 
/* Symbolic Cholesky: delegate to the CPU SeqAIJ implementation, then override
   the numeric-phase pointer with the CUSPARSE variant (after the call, so the
   GPU routine wins over whatever the CPU symbolic phase installed). */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
235 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - extracts the lower triangular
  factor L from the host ILU factorization held in A (Mat_SeqAIJ layout,
  strictly-lower entries per row in a->i/a->j/a->a) and uploads it as a CSR
  matrix configured for cusparse triangular solves.

  An explicit 1.0 is appended on the diagonal of every row; the cusparse
  descriptor is nevertheless marked CUSPARSE_DIAG_TYPE_UNIT (so cusparse
  presumably ignores those stored entries — TODO confirm).  Runs only when
  the GPU copy is absent or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* rows 1..n-1 contribute ai[i+1]-ai[i] off-diagonal entries (sum = ai[n]-ai[1]);
         each of the n rows also gets one unit-diagonal entry; row 0 has only its diagonal */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned host buffers: assembled on the host, then copied to the device by thrust */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy row i's strictly-lower entries, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* the assign() calls below copy the pinned host buffers to device vectors */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
337 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - extracts the upper triangular
  factor U from the host ILU factorization held in A and uploads it as a CSR
  matrix configured for cusparse triangular solves.

  The host factor stores U via a->diag: the indexing below walks rows from
  n-1 down to 0 and treats adiag[] as decreasing offsets into a->a/a->j, with
  the stored diagonal value presumably being 1/U(i,i) since it is inverted
  here (AAUp = 1./v[nz]) — TODO confirm against the SeqAIJ factored layout.
  Runs only when the GPU copy is absent or stale.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers: assembled on the host, then copied to the device by thrust */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, filling rows back-to-front so that
         'offset' can be decremented by each row's width before writing it */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      /* the assign() calls below copy the pinned host buffers to device vectors */
      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
435 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - after a numeric ILU/LU
  factorization, uploads both triangular factors to the GPU (which also runs
  the cusparse solve analyses), allocates the work vector used between the
  two triangular solves, and caches the row/column permutation index arrays
  needed when the ordering is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector holding the intermediate result between L- and U-solve */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  cusparseTriFactors->nnz=a->nz;

  /* mark host and device copies as consistent now that both factors are uploaded */
  A->valid_GPU_matrix = PETSC_CUDA_BOTH;
  /*lower triangular indices */
  /* permutations are only materialized on the GPU when they are non-trivial */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /*upper triangular indices */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
474 
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - builds both triangular factors of an
  ICC/Cholesky factorization on the GPU from the host factor stored in A.

  Only the upper-triangular CSR pattern is assembled on the host (AiUp/AjUp);
  it is uploaded twice with different value arrays: AAUp for the U-solve, and
  AALo (the U values rescaled by the diagonal) for a factor that is solved
  with CUSPARSE_OPERATION_TRANSPOSE, i.e. the L-solve is realized as a solve
  with the transpose of an upper-triangular matrix.

  NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and to
  Mat_SeqSBAIJ (for the i/j/a arrays) — apparently the Cholesky factor of a
  SeqAIJ matrix is stored in an SBAIJ-compatible layout; confirm against
  MatCholeskyFactorNumeric_SeqAIJ.  The stored diagonal entry v[nz] is
  inverted here, so it presumably holds 1/D(i,i)'s reciprocal form — confirm.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers; AjUp/AiUp are shared by both factors below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          /* off-diagonals are negated for U; the L values get an extra
             diagonal scaling on top of that */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      /* same upper-triangular pattern as above, but solved transposed
         (see solveOp below) to act as the lower-triangular factor */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->valid_GPU_matrix = PETSC_CUDA_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
619 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - after a numeric ICC/Cholesky
  factorization, builds both GPU triangular factors, allocates the work
  vector used between the two solves, and caches the (single, symmetric)
  permutation for both the row and column sides when it is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* device scratch vector holding the intermediate result between the solves */
  cusparseTriFactors->workVector = new THRUSTARRAY;
  cusparseTriFactors->workVector->resize(n);
  /* the a->nz upper-triangle entries (minus the n diagonals) appear in both
     factors, plus the n shared diagonal entries */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  /*lower triangular indices */
  /* the same permutation serves both sides since the factorization is symmetric */
  ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(rip, rip+n);
  }
  ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
648 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU phase: runs the CPU SeqAIJ
  numeric kernel, installs the GPU solve routines (the cheaper NaturalOrdering
  variants when both orderings are the identity, since those skip the
  permutation step), and stages the triangular factors on the device.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* choose between the permuting and non-permuting MatSolve variants */
  ierr = ISIdentity(isrow,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&colIdentity);CHKERRQ(ierr);
  if (!rowIdentity || !colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* upload the factors and run the cusparse solve analyses */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
673 
/* Numeric Cholesky factorization: run the CPU SeqAIJ factorization, pick the
   solve kernels appropriate for the ordering, then move the factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* The numeric factorization itself happens on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* Natural (identity) ordering admits the faster permutation-free solves;
     a symmetric factorization has a single permutation, fact->row. */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* Copy the triangular factors to the GPU and run the cuSPARSE solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
698 
/* Build the transposes of the lower and upper triangular factors (as CSC via
   cusparse_csr2csc, stored in CSR form) and run the cuSPARSE solve analysis on
   them, so that MatSolveTranspose can use NON_TRANSPOSE solves.

   Fix vs. original: loTriFactorT/upTriFactorT were initialized from
   cusparseTriFactors->{lo,up}TriFactorPtrTranspose and then immediately
   overwritten with `new`, i.e. dead stores. The callers only invoke this routine
   when both transpose pointers are NULL; NOTE(review): if that ever changes, the
   previous structs would leak here — confirm callers before reuse. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;

  /*********************************************/
  /* Now the Transpose of the Lower Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the lower triangular factor */
  loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the lower triangular factor; the fill mode is
     flipped because the transpose of a lower triangle is an upper triangle */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so solve it directly */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
  loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC;
     CSC of A is exactly the CSR of A^T */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                          loTriFactorT->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
                           loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
                           loTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the new transpose factor */
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* set the matrix descriptors of the upper triangular factor; fill mode is
     flipped for the same reason as above */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* set the operation: the transpose is stored explicitly, so solve it directly */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
  upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                          upTriFactorT->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
                           upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
                           upTriFactorT->solveInfo);CHKERRCUDA(stat);

  /* publish the new transpose factor */
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
829 
/* Build an explicit transpose of the matrix on the GPU (cached in
   cusparsestruct->matTranspose) so MatMultTranspose can run a NON_TRANSPOSE
   SpMV. Handles CSR directly; for ELL/HYB it round-trips HYB->CSR->CSC->HYB. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  /* NOTE(review): matstructT is read from cusparsestruct->matTranspose here but
     unconditionally overwritten with `new` below; if a transpose already
     existed it would leak — presumably callers only call this when it is NULL,
     TODO confirm. */
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate the mult struct for the transpose; descriptor copies the index
     base of the original matrix and is a general (non-triangular) matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta: device-resident scalars for the SpMV calls
     (pointer mode is switched to DEVICE below) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->rmap->n;
    matrixT->num_cols = A->cmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of the matrix, i.e. the CSC: the CSC of A is
       exactly the CSR of A^T */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR (cusparse_hyb2csr requires CUDA 5.0+) */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB in the same partition style as the original */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries: the intermediate CSR/CSC copies are no longer needed */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices (left empty here; presumably filled or
     unused for the transpose — TODO confirm against the MatMultTranspose path) */
  matstructT->cprowIndices = new THRUSTINTARRAY;

  /* publish the transpose in the per-matrix CUSPARSE struct */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
956 
/* Solve (LU)^T x = b with row/column permutations applied on the GPU.
   Since (LU)^T = U^T L^T, the upper factor's transpose is solved first,
   then the lower factor's transpose. The transposed factors are built
   lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (first call only) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder b with the row permutation into x: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U^T (the upper factor comes first for the transposed system) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays and block until the solves finish */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1024 
/* Solve (LU)^T x = b when the ordering is the identity: no permutations are
   needed, so the two transposed triangular solves run back to back.
   Since (LU)^T = U^T L^T, U^T is solved first. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT        = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT        = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work       = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors lazily on first use */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  /* First, solve U^T: b -> work */
  stat = cusparse_solve(triFactors->handle, upT->solveOp,
                        upT->csrMat->num_rows, &ALPHA, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* Then, solve L^T: work -> x */
  stat = cusparse_solve(triFactors->handle, loT->solveOp,
                        loT->csrMat->num_rows, &ALPHA, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* restore the vector arrays and block until the solves finish */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1073 
/* Solve LU x = b with row/column permutations applied on the GPU:
   permute b with the row permutation, solve L then U, then apply the
   column permutation to the result (via the work vector, since the
   permutation cannot be applied in place). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *b;
  PetscScalar                           *x;
  thrust::device_ptr<const PetscScalar> bDev;
  thrust::device_ptr<PetscScalar>       xDev;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lo         = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *up         = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                           *work       = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                        ierr;
  VecType                               t;
  PetscBool                             flg;

  PetscFunctionBegin;
  /* Both vectors must live on the GPU */
  ierr = VecGetType(bb,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #2). Can only deal with %s\n.",t,VECSEQCUDA);
  ierr = VecGetType(xx,&t);CHKERRQ(ierr);
  ierr = PetscStrcmp(t,VECSEQCUDA,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Vector of type %s passed into MatSolve_SeqAIJCUSPARSE (Arg #3). Can only deal with %s\n.",t,VECSEQCUDA);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);
  xDev = thrust::device_pointer_cast(x);
  bDev = thrust::device_pointer_cast(b);

  /* Apply the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bDev, triFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bDev, triFactors->rpermIndices->end()),
               xDev);

  /* Forward solve with L: x -> work */
  stat = cusparse_solve(triFactors->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        x, work->data().get());CHKERRCUDA(stat);

  /* Backward solve with U: work -> x */
  stat = cusparse_solve(triFactors->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* Apply the column permutation into the work vector (not doable in place),
     then copy the permuted result back into x */
  thrust::copy(thrust::make_permutation_iterator(xDev, triFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xDev, triFactors->cpermIndices->end()),
               work->begin());
  thrust::copy(work->begin(), work->end(), xDev);

  /* restore the vector arrays and block until the solves finish */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1140 
/* Solve LU x = b when the ordering is the identity: no permutations are
   needed, so it is just a forward solve with L followed by a backward
   solve with U through the shared work vector. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo         = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up         = (Mat_SeqAIJCUSPARSETriFactorStruct*)triFactors->upTriFactorPtr;
  THRUSTARRAY                       *work       = (THRUSTARRAY*)triFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  /* Forward solve with L: b -> work */
  stat = cusparse_solve(triFactors->handle, lo->solveOp,
                        lo->csrMat->num_rows, &ALPHA, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, work->data().get());CHKERRCUDA(stat);

  /* Backward solve with U: work -> x */
  stat = cusparse_solve(triFactors->handle, up->solveOp,
                        up->csrMat->num_rows, &ALPHA, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        work->data().get(), x);CHKERRCUDA(stat);

  /* restore the vector arrays and block until the solves finish */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*triFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1181 
1182 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1183 {
1184 
1185   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1186   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1187   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1188   PetscInt                     m = A->rmap->n,*ii,*ridx;
1189   PetscErrorCode               ierr;
1190   cusparseStatus_t             stat;
1191   cudaError_t                  err;
1192 
1193   PetscFunctionBegin;
1194   if (A->valid_GPU_matrix == PETSC_CUDA_UNALLOCATED || A->valid_GPU_matrix == PETSC_CUDA_CPU) {
1195     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1196     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1197       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1198       /* copy values only */
1199       matrix->values->assign(a->a, a->a+a->nz);
1200     } else {
1201       Mat_SeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1202       try {
1203         cusparsestruct->nonzerorow=0;
1204         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1205 
1206         if (a->compressedrow.use) {
1207           m    = a->compressedrow.nrows;
1208           ii   = a->compressedrow.i;
1209           ridx = a->compressedrow.rindex;
1210         } else {
1211           /* Forcing compressed row on the GPU */
1212           int k=0;
1213           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1214           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1215           ii[0]=0;
1216           for (int j = 0; j<m; j++) {
1217             if ((a->i[j+1]-a->i[j])>0) {
1218               ii[k]  = a->i[j];
1219               ridx[k]= j;
1220               k++;
1221             }
1222           }
1223           ii[cusparsestruct->nonzerorow] = a->nz;
1224           m = cusparsestruct->nonzerorow;
1225         }
1226 
1227         /* allocate space for the triangular factor information */
1228         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1229         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1230         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1231         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1232 
1233         err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
1234         err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1235         err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
1236         err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1237         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1238 
1239         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1240         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1241           /* set the matrix */
1242           CsrMatrix *matrix= new CsrMatrix;
1243           matrix->num_rows = m;
1244           matrix->num_cols = A->cmap->n;
1245           matrix->num_entries = a->nz;
1246           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1247           matrix->row_offsets->assign(ii, ii + m+1);
1248 
1249           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1250           matrix->column_indices->assign(a->j, a->j+a->nz);
1251 
1252           matrix->values = new THRUSTARRAY(a->nz);
1253           matrix->values->assign(a->a, a->a+a->nz);
1254 
1255           /* assign the pointer */
1256           matstruct->mat = matrix;
1257 
1258         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1259 #if CUDA_VERSION>=4020
1260           CsrMatrix *matrix= new CsrMatrix;
1261           matrix->num_rows = m;
1262           matrix->num_cols = A->cmap->n;
1263           matrix->num_entries = a->nz;
1264           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1265           matrix->row_offsets->assign(ii, ii + m+1);
1266 
1267           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1268           matrix->column_indices->assign(a->j, a->j+a->nz);
1269 
1270           matrix->values = new THRUSTARRAY(a->nz);
1271           matrix->values->assign(a->a, a->a+a->nz);
1272 
1273           cusparseHybMat_t hybMat;
1274           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1275           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1276             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1277           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1278               matstruct->descr, matrix->values->data().get(),
1279               matrix->row_offsets->data().get(),
1280               matrix->column_indices->data().get(),
1281               hybMat, 0, partition);CHKERRCUDA(stat);
1282           /* assign the pointer */
1283           matstruct->mat = hybMat;
1284 
1285           if (matrix) {
1286             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1287             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1288             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1289             delete (CsrMatrix*)matrix;
1290           }
1291 #endif
1292         }
1293 
1294         /* assign the compressed row indices */
1295         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1296         matstruct->cprowIndices->assign(ridx,ridx+m);
1297 
1298         /* assign the pointer */
1299         cusparsestruct->mat = matstruct;
1300 
1301         if (!a->compressedrow.use) {
1302           ierr = PetscFree(ii);CHKERRQ(ierr);
1303           ierr = PetscFree(ridx);CHKERRQ(ierr);
1304         }
1305         cusparsestruct->workVector = new THRUSTARRAY;
1306         cusparsestruct->workVector->resize(m);
1307       } catch(char *ex) {
1308         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1309       }
1310       cusparsestruct->nonzerostate = A->nonzerostate;
1311     }
1312     ierr = WaitForGPU();CHKERRCUDA(ierr);
1313     A->valid_GPU_matrix = PETSC_CUDA_BOTH;
1314     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1315   }
1316   PetscFunctionReturn(0);
1317 }
1318 
/* Creates CUDA vectors that conform to the matrix row/column layouts so that
   multiplies with this matrix can run entirely on the GPU.
   right - vector the matrix can be multiplied against (length = number of columns)
   left  - vector the matrix result can be stored in   (length = number of rows)
   Either output may be NULL if not wanted. */
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  PetscInt rbs,cbs;  /* row/column block sizes, propagated to the vectors */

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix column layout rather than building a new one */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix row layout rather than building a new one */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1342 
/* Thrust functor applied to a (source, destination) tuple: accumulates the
   source element into the destination element. Used below to scatter-add
   compressed-row products from the work vector back into a full-length
   vector via a permutation iterator. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1352 
/* Computes yy = A*xx on the GPU with cuSPARSE, using CSR or (with CUDA >= 4.2)
   ELL/HYB storage. The device copy of A is refreshed first if CPU-side
   operations have invalidated it. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR: y = alpha*A*x + beta*y where alpha and beta are device pointers
       loaded in MatSeqAIJCUSPARSECopyToGPU (pointer mode is CUSPARSE_POINTER_MODE_DEVICE) */
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB storage requires CUDA 4.2+; on older toolkits this branch is compiled out */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* with no user-supplied stream, block until the multiply has completed */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* one multiply per nonzero plus one add per nonzero beyond the first in each nonempty row */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1392 
/* Computes yy = A^T*xx. An explicit transpose structure (matTranspose) is
   built lazily on first use; the multiply is then performed with
   CUSPARSE_OPERATION_NON_TRANSPOSE on that stored transpose. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* y = alpha*A^T*x + beta*y via a non-transpose multiply on the stored transpose */
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL/HYB storage requires CUDA 4.2+; on older toolkits this branch is compiled out */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  /* with no user-supplied stream, block until the multiply has completed */
  if (!cusparsestruct->stream) {
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1438 
1439 
/* Computes zz = yy + A*xx. zz is first loaded with a copy of yy; the product
   A*xx is formed in the compressed-row work vector (one entry per nonempty
   row) and then scatter-added into zz through the cprowIndices permutation. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  try {
    /* start from zz = yy; the product is accumulated on top of it below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      /* ELL/HYB storage requires CUDA 4.2+; skip the multiply entirely for an empty work vector */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray, matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation:
       entry i of the work vector is added to zz[cprowIndices[i]] */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1498 
/* Computes zz = yy + A^T*xx using the lazily built explicit transpose. The
   product is staged in the work vector and scatter-added into zz (pre-loaded
   with yy) through matstructT->cprowIndices.
   NOTE(review): cusparsestruct->workVector is sized for the non-transpose
   compressed rows; this assumes the transpose structure produced by
   MatSeqAIJCUSPARSEGenerateTransposeForMult fits in it -- confirm there. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* build the explicit transpose on first use */
  if (!matstructT) {
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy; the product is accumulated on top of it below */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      /* ELL/HYB storage requires CUDA 4.2+; skip the multiply entirely for an empty work vector */
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation:
       entry i of the work vector is added to zz[cprowIndices[i]] */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1563 
/* Completes assembly on the CPU, refreshes the device copy for unfactored
   matrices, and (for a final assembly only) installs the GPU multiply kernels. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side assembly first */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their data in the triangular-factor structures */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  /* only install the GPU kernels once assembly is final */
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1580 
1581 /* --------------------------------------------------------------------------------*/
1582 /*@
1583    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1585    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1586    assembly performance the user should preallocate the matrix storage by setting
1587    the parameter nz (or the array nnz).  By setting these parameters accurately,
1588    performance during matrix assembly can be increased by more than a factor of 50.
1589 
1590    Collective on MPI_Comm
1591 
1592    Input Parameters:
1593 +  comm - MPI communicator, set to PETSC_COMM_SELF
1594 .  m - number of rows
1595 .  n - number of columns
1596 .  nz - number of nonzeros per row (same for all rows)
1597 -  nnz - array containing the number of nonzeros in the various rows
1598          (possibly different for each row) or NULL
1599 
1600    Output Parameter:
1601 .  A - the matrix
1602 
1603    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1605    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1606 
1607    Notes:
1608    If nnz is given then nz is ignored
1609 
1610    The AIJ format (also called the Yale sparse matrix format or
1611    compressed row storage), is fully compatible with standard Fortran 77
1612    storage.  That is, the stored row and column indices can begin at
1613    either one (as in Fortran) or zero.  See the users' manual for details.
1614 
1615    Specify the preallocated storage with either nz or nnz (not both).
1616    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1617    allocation.  For large problems you MUST preallocate memory or you
1618    will get TERRIBLE performance, see the users' manual chapter on matrices.
1619 
1620    By default, this format uses inodes (identical nodes) when possible, to
1621    improve numerical efficiency of matrix-vector products and solves. We
1622    search for consecutive rows with the same nonzero structure, thereby
1623    reusing matrix information to achieve increased efficiency.
1624 
1625    Level: intermediate
1626 
1627 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1628 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* cast away const only to match the preallocation routine's signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1640 
/* Destroys the GPU side of the matrix (triangular factors for factored
   matrices, multiply structures otherwise) and then the host AIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices store triangular-factor data in spptr */
    ierr = Mat_SeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else if (A->valid_GPU_matrix != PETSC_CUDA_UNALLOCATED) {
    /* only free the device structures if a device copy was ever allocated */
    ierr = Mat_SeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1656 
/* Type constructor for MATSEQAIJCUSPARSE: builds the host AIJ matrix, then
   attaches either the GPU multiply structure (unfactored matrices) or the
   triangular-factor structure (factored matrices) in B->spptr, and installs
   the CUSPARSE-specific operations. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure; use a typed pointer instead of
       repeated casts of B->spptr, and assign each field exactly once */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    cusp->handle       = 0;   /* stays 0 unless cusparseCreate() succeeds */
    B->spptr           = cusp;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle = handle;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    tri->handle                  = 0;   /* stays 0 unless cusparseCreate() succeeds */
    tri->nnz                     = 0;
    B->spptr                     = tri;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle = handle;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* no device copy exists yet; MatSeqAIJCUSPARSECopyToGPU allocates it lazily */
  B->valid_GPU_matrix = PETSC_CUDA_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1710 
1711 /*M
1712    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1713 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
1715    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1716    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1717 
1718    Options Database Keys:
1719 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1720 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1722 
1723   Level: beginner
1724 
1725 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1726 M*/
1727 
1728 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1729 
1730 
/* Registers the CUSPARSE solver package for every factorization kind it
   supports (LU, Cholesky, ILU, ICC) on MATSEQAIJCUSPARSE matrices; all of
   them are served by the same factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverPackageRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverPackageRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1742 
1743 
/* Frees the GPU multiply structures of an unfactored matrix: the matrix and
   its cached transpose, the compressed-row work vector, and the cuSPARSE
   handle. The caller's pointer is zeroed so a second call is a no-op.
   Fixes: the PetscErrorCode results of the two MultStruct destroys were
   silently discarded, and the assignment-as-condition lacked the extra
   parentheses that document intent (and silence -Wparentheses). */
static PetscErrorCode Mat_SeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    if ((handle = (*cusparsestruct)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1762 
/* Frees the three device arrays of a CsrMatrix and the struct itself; the
   caller's pointer is zeroed so a repeated call is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1775 
/* Frees one triangular-factor structure: its cuSPARSE matrix descriptor,
   solve-analysis info, and CSR storage. Zeroes the caller's pointer so a
   repeated call is harmless. */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1791 
/* Frees a GPU multiply structure: the stored matrix (HYB handle or CSR
   arrays, depending on format), the matrix descriptor, the compressed-row
   index array, and the device-resident alpha/beta scalars.
   Fixes: the PetscErrorCode returned by CsrMatrix_Destroy() was silently
   discarded; it is now checked with CHKERRQ like every other call here. */
static PetscErrorCode Mat_SeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB storage both live in a cuSPARSE hybrid-matrix handle */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1818 
/* Frees the complete triangular-factor container: all four factor structures
   (lower/upper and their transposes), the row/column permutation index
   arrays, the work vector, and the cuSPARSE handle.
   Fixes: the PetscErrorCode results of the four TriFactorStruct destroys
   were silently discarded, and the assignment-as-condition lacked the extra
   parentheses that document intent (and silence -Wparentheses). */
static PetscErrorCode Mat_SeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = Mat_SeqAIJCUSPARSETriFactorStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1841 
1842