xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4863603af06b16768e4e4a060cedf7d42fa55e81)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
41 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
42 {
43   cusparseStatus_t   stat;
44   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
45 
46   PetscFunctionBegin;
47   cusparsestruct->stream = stream;
48   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
49   PetscFunctionReturn(0);
50 }
51 
52 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
53 {
54   cusparseStatus_t   stat;
55   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
56 
57   PetscFunctionBegin;
58   if (cusparsestruct->handle != handle) {
59     if (cusparsestruct->handle) {
60       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
61     }
62     cusparsestruct->handle = handle;
63   }
64   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
65   PetscFunctionReturn(0);
66 }
67 
68 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
69 {
70   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
71   PetscFunctionBegin;
72   if (cusparsestruct->handle)
73     cusparsestruct->handle = 0;
74   PetscFunctionReturn(0);
75 }
76 
77 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
78 {
79   PetscFunctionBegin;
80   *type = MATSOLVERCUSPARSE;
81   PetscFunctionReturn(0);
82 }
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
98 {
99   PetscErrorCode ierr;
100   PetscInt       n = A->rmap->n;
101 
102   PetscFunctionBegin;
103   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
104   (*B)->factortype = ftype;
105   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
106   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
107 
108   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
110     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
116 
117   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
118   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123 {
124   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
125 
126   PetscFunctionBegin;
127 #if CUDA_VERSION>=4020
128   switch (op) {
129   case MAT_CUSPARSE_MULT:
130     cusparsestruct->format = format;
131     break;
132   case MAT_CUSPARSE_ALL:
133     cusparsestruct->format = format;
134     break;
135   default:
136     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137   }
138 #else
139   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140 #endif
141   PetscFunctionReturn(0);
142 }
143 
144 /*@
145    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146    operation. Only the MatMult operation can use different GPU storage formats
147    for MPIAIJCUSPARSE matrices.
148    Not Collective
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
154 
155    Output Parameter:
156 
157    Level: intermediate
158 
159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160 @*/
161 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162 {
163   PetscErrorCode ierr;
164 
165   PetscFunctionBegin;
166   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
167   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
168   PetscFunctionReturn(0);
169 }
170 
171 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172 {
173   PetscErrorCode           ierr;
174   MatCUSPARSEStorageFormat format;
175   PetscBool                flg;
176   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
177 
178   PetscFunctionBegin;
179   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
180   if (A->factortype==MAT_FACTOR_NONE) {
181     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
182                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
183     if (flg) {
184       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
185     }
186   }
187   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
188                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
189   if (flg) {
190     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
191   }
192   ierr = PetscOptionsTail();CHKERRQ(ierr);
193   PetscFunctionReturn(0);
194 
195 }
196 
197 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
198 {
199   PetscErrorCode ierr;
200 
201   PetscFunctionBegin;
202   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
203   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
204   PetscFunctionReturn(0);
205 }
206 
207 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
208 {
209   PetscErrorCode ierr;
210 
211   PetscFunctionBegin;
212   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
213   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
214   PetscFunctionReturn(0);
215 }
216 
217 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
218 {
219   PetscErrorCode ierr;
220 
221   PetscFunctionBegin;
222   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
223   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
224   PetscFunctionReturn(0);
225 }
226 
227 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
228 {
229   PetscErrorCode ierr;
230 
231   PetscFunctionBegin;
232   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
233   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
234   PetscFunctionReturn(0);
235 }
236 
237 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
238 {
239   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
240   PetscInt                          n = A->rmap->n;
241   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
242   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
243   cusparseStatus_t                  stat;
244   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
245   const MatScalar                   *aa = a->a,*v;
246   PetscInt                          *AiLo, *AjLo;
247   PetscScalar                       *AALo;
248   PetscInt                          i,nz, nzLower, offset, rowOffset;
249   PetscErrorCode                    ierr;
250 
251   PetscFunctionBegin;
252   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
253     try {
254       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255       nzLower=n+ai[n]-ai[1];
256 
257       /* Allocate Space for the lower triangular matrix */
258       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
259       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
260       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
261 
262       /* Fill the lower triangular matrix */
263       AiLo[0]  = (PetscInt) 0;
264       AiLo[n]  = nzLower;
265       AjLo[0]  = (PetscInt) 0;
266       AALo[0]  = (MatScalar) 1.0;
267       v        = aa;
268       vi       = aj;
269       offset   = 1;
270       rowOffset= 1;
271       for (i=1; i<n; i++) {
272         nz = ai[i+1] - ai[i];
273         /* additional 1 for the term on the diagonal */
274         AiLo[i]    = rowOffset;
275         rowOffset += nz+1;
276 
277         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
278         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
279 
280         offset      += nz;
281         AjLo[offset] = (PetscInt) i;
282         AALo[offset] = (MatScalar) 1.0;
283         offset      += 1;
284 
285         v  += nz;
286         vi += nz;
287       }
288 
289       /* allocate space for the triangular factor information */
290       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
291 
292       /* Create the matrix description */
293       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
294       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
295       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
296       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
297       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
298 
299       /* Create the solve analysis information */
300       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
301 
302       /* set the operation */
303       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
304 
305       /* set the matrix */
306       loTriFactor->csrMat = new CsrMatrix;
307       loTriFactor->csrMat->num_rows = n;
308       loTriFactor->csrMat->num_cols = n;
309       loTriFactor->csrMat->num_entries = nzLower;
310 
311       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
312       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
313 
314       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
315       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
316 
317       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
318       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
319 
320       /* perform the solve analysis */
321       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
322                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
323                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
324                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
325 
326       /* assign the pointer. Is this really necessary? */
327       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
328 
329       ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
330       ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
331       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
332       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
333     } catch(char *ex) {
334       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335     }
336   }
337   PetscFunctionReturn(0);
338 }
339 
340 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341 {
342   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
343   PetscInt                          n = A->rmap->n;
344   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346   cusparseStatus_t                  stat;
347   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
348   const MatScalar                   *aa = a->a,*v;
349   PetscInt                          *AiUp, *AjUp;
350   PetscScalar                       *AAUp;
351   PetscInt                          i,nz, nzUpper, offset;
352   PetscErrorCode                    ierr;
353 
354   PetscFunctionBegin;
355   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
356     try {
357       /* next, figure out the number of nonzeros in the upper triangular matrix. */
358       nzUpper = adiag[0]-adiag[n];
359 
360       /* Allocate Space for the upper triangular matrix */
361       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
364 
365       /* Fill the upper triangular matrix */
366       AiUp[0]=(PetscInt) 0;
367       AiUp[n]=nzUpper;
368       offset = nzUpper;
369       for (i=n-1; i>=0; i--) {
370         v  = aa + adiag[i+1] + 1;
371         vi = aj + adiag[i+1] + 1;
372 
373         /* number of elements NOT on the diagonal */
374         nz = adiag[i] - adiag[i+1]-1;
375 
376         /* decrement the offset */
377         offset -= (nz+1);
378 
379         /* first, set the diagonal elements */
380         AjUp[offset] = (PetscInt) i;
381         AAUp[offset] = (MatScalar)1./v[nz];
382         AiUp[i]      = AiUp[i+1] - (nz+1);
383 
384         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
385         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
386       }
387 
388       /* allocate space for the triangular factor information */
389       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
390 
391       /* Create the matrix description */
392       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
397 
398       /* Create the solve analysis information */
399       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
400 
401       /* set the operation */
402       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
403 
404       /* set the matrix */
405       upTriFactor->csrMat = new CsrMatrix;
406       upTriFactor->csrMat->num_rows = n;
407       upTriFactor->csrMat->num_cols = n;
408       upTriFactor->csrMat->num_entries = nzUpper;
409 
410       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
412 
413       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
415 
416       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
418 
419       /* perform the solve analysis */
420       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
424 
425       /* assign the pointer. Is this really necessary? */
426       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
427 
428       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
432     } catch(char *ex) {
433       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
434     }
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
440 {
441   PetscErrorCode               ierr;
442   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
443   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
444   IS                           isrow = a->row,iscol = a->icol;
445   PetscBool                    row_identity,col_identity;
446   const PetscInt               *r,*c;
447   PetscInt                     n = A->rmap->n;
448 
449   PetscFunctionBegin;
450   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
451   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
452 
453   cusparseTriFactors->workVector = new THRUSTARRAY(n);
454   cusparseTriFactors->nnz=a->nz;
455 
456   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
457   /*lower triangular indices */
458   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
459   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
460   if (!row_identity) {
461     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462     cusparseTriFactors->rpermIndices->assign(r, r+n);
463   }
464   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
465 
466   /*upper triangular indices */
467   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
468   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
469   if (!col_identity) {
470     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471     cusparseTriFactors->cpermIndices->assign(c, c+n);
472   }
473 
474   if(!row_identity && !col_identity) {
475     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
476   } else if(!row_identity) {
477     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
478   } else if(!col_identity) {
479     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
480   }
481 
482   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
487 {
488   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
489   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
491   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
492   cusparseStatus_t                  stat;
493   PetscErrorCode                    ierr;
494   PetscInt                          *AiUp, *AjUp;
495   PetscScalar                       *AAUp;
496   PetscScalar                       *AALo;
497   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
498   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
499   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
500   const MatScalar                   *aa = b->a,*v;
501 
502   PetscFunctionBegin;
503   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
504     try {
505       /* Allocate Space for the upper triangular matrix */
506       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
507       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
508       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
509       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
510 
511       /* Fill the upper triangular matrix */
512       AiUp[0]=(PetscInt) 0;
513       AiUp[n]=nzUpper;
514       offset = 0;
515       for (i=0; i<n; i++) {
516         /* set the pointers */
517         v  = aa + ai[i];
518         vj = aj + ai[i];
519         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
520 
521         /* first, set the diagonal elements */
522         AjUp[offset] = (PetscInt) i;
523         AAUp[offset] = (MatScalar)1.0/v[nz];
524         AiUp[i]      = offset;
525         AALo[offset] = (MatScalar)1.0/v[nz];
526 
527         offset+=1;
528         if (nz>0) {
529           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
530           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
531           for (j=offset; j<offset+nz; j++) {
532             AAUp[j] = -AAUp[j];
533             AALo[j] = AAUp[j]/v[nz];
534           }
535           offset+=nz;
536         }
537       }
538 
539       /* allocate space for the triangular factor information */
540       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
541 
542       /* Create the matrix description */
543       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
544       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
545       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
546       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
547       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
548 
549       /* Create the solve analysis information */
550       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
551 
552       /* set the operation */
553       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
554 
555       /* set the matrix */
556       upTriFactor->csrMat = new CsrMatrix;
557       upTriFactor->csrMat->num_rows = A->rmap->n;
558       upTriFactor->csrMat->num_cols = A->cmap->n;
559       upTriFactor->csrMat->num_entries = a->nz;
560 
561       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
562       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
563 
564       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
565       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
566 
567       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
568       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
569 
570       /* perform the solve analysis */
571       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
572                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
573                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
574                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
575 
576       /* assign the pointer. Is this really necessary? */
577       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
578 
579       /* allocate space for the triangular factor information */
580       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
581 
582       /* Create the matrix description */
583       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
584       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
585       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
586       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
587       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
588 
589       /* Create the solve analysis information */
590       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
591 
592       /* set the operation */
593       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
594 
595       /* set the matrix */
596       loTriFactor->csrMat = new CsrMatrix;
597       loTriFactor->csrMat->num_rows = A->rmap->n;
598       loTriFactor->csrMat->num_cols = A->cmap->n;
599       loTriFactor->csrMat->num_entries = a->nz;
600 
601       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
602       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
603 
604       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
605       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
606 
607       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
608       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
609       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
610 
611       /* perform the solve analysis */
612       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
613                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
614                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
615                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
616 
617       /* assign the pointer. Is this really necessary? */
618       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
619 
620       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
621       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
622       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
623       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
624       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
625     } catch(char *ex) {
626       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
627     }
628   }
629   PetscFunctionReturn(0);
630 }
631 
632 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
633 {
634   PetscErrorCode               ierr;
635   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
636   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
637   IS                           ip = a->row;
638   const PetscInt               *rip;
639   PetscBool                    perm_identity;
640   PetscInt                     n = A->rmap->n;
641 
642   PetscFunctionBegin;
643   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
644   cusparseTriFactors->workVector = new THRUSTARRAY(n);
645   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
646 
647   /*lower triangular indices */
648   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
649   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
650   if (!perm_identity) {
651     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
652     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
653     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
654     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
655     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
656   }
657   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
658   PetscFunctionReturn(0);
659 }
660 
661 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
662 {
663   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
664   IS             isrow = b->row,iscol = b->col;
665   PetscBool      row_identity,col_identity;
666   PetscErrorCode ierr;
667 
668   PetscFunctionBegin;
669   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
670   /* determine which version of MatSolve needs to be used. */
671   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
672   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
673   if (row_identity && col_identity) {
674     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
675     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
676   } else {
677     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
678     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
679   }
680 
681   /* get the triangular factors */
682   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
683   PetscFunctionReturn(0);
684 }
685 
686 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
687 {
688   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
689   IS             ip = b->row;
690   PetscBool      perm_identity;
691   PetscErrorCode ierr;
692 
693   PetscFunctionBegin;
694   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
695 
696   /* determine which version of MatSolve needs to be used. */
697   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
698   if (perm_identity) {
699     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
700     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
701   } else {
702     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
703     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
704   }
705 
706   /* get the triangular factors */
707   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
708   PetscFunctionReturn(0);
709 }
710 
711 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
712 {
713   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
714   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
715   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
716   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
717   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
718   cusparseStatus_t                  stat;
719   cusparseIndexBase_t               indexBase;
720   cusparseMatrixType_t              matrixType;
721   cusparseFillMode_t                fillMode;
722   cusparseDiagType_t                diagType;
723 
724   PetscFunctionBegin;
725 
726   /*********************************************/
727   /* Now the Transpose of the Lower Tri Factor */
728   /*********************************************/
729 
730   /* allocate space for the transpose of the lower triangular factor */
731   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
732 
733   /* set the matrix descriptors of the lower triangular factor */
734   matrixType = cusparseGetMatType(loTriFactor->descr);
735   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
736   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
737     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
738   diagType = cusparseGetMatDiagType(loTriFactor->descr);
739 
740   /* Create the matrix description */
741   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
742   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
743   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
744   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
745   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
746 
747   /* Create the solve analysis information */
748   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
749 
750   /* set the operation */
751   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
752 
753   /* allocate GPU space for the CSC of the lower triangular factor*/
754   loTriFactorT->csrMat = new CsrMatrix;
755   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
756   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
757   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
758   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
759   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
760   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
761 
762   /* compute the transpose of the lower triangular factor, i.e. the CSC */
763   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
764                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
765                           loTriFactor->csrMat->values->data().get(),
766                           loTriFactor->csrMat->row_offsets->data().get(),
767                           loTriFactor->csrMat->column_indices->data().get(),
768                           loTriFactorT->csrMat->values->data().get(),
769                           loTriFactorT->csrMat->column_indices->data().get(),
770                           loTriFactorT->csrMat->row_offsets->data().get(),
771                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
772 
773   /* perform the solve analysis on the transposed matrix */
774   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
775                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
776                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
777                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
778                            loTriFactorT->solveInfo);CHKERRCUDA(stat);
779 
780   /* assign the pointer. Is this really necessary? */
781   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
782 
783   /*********************************************/
784   /* Now the Transpose of the Upper Tri Factor */
785   /*********************************************/
786 
787   /* allocate space for the transpose of the upper triangular factor */
788   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
789 
790   /* set the matrix descriptors of the upper triangular factor */
791   matrixType = cusparseGetMatType(upTriFactor->descr);
792   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
793   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
794     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
795   diagType = cusparseGetMatDiagType(upTriFactor->descr);
796 
797   /* Create the matrix description */
798   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
799   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
800   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
801   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
802   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
803 
804   /* Create the solve analysis information */
805   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
806 
807   /* set the operation */
808   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
809 
810   /* allocate GPU space for the CSC of the upper triangular factor*/
811   upTriFactorT->csrMat = new CsrMatrix;
812   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
813   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
814   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
815   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
816   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
817   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
818 
819   /* compute the transpose of the upper triangular factor, i.e. the CSC */
820   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
821                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
822                           upTriFactor->csrMat->values->data().get(),
823                           upTriFactor->csrMat->row_offsets->data().get(),
824                           upTriFactor->csrMat->column_indices->data().get(),
825                           upTriFactorT->csrMat->values->data().get(),
826                           upTriFactorT->csrMat->column_indices->data().get(),
827                           upTriFactorT->csrMat->row_offsets->data().get(),
828                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
829 
830   /* perform the solve analysis on the transposed matrix */
831   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
832                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
833                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
834                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
835                            upTriFactorT->solveInfo);CHKERRCUDA(stat);
836 
837   /* assign the pointer. Is this really necessary? */
838   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
839   PetscFunctionReturn(0);
840 }
841 
842 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
843 {
844   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
845   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
846   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
847   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
848   cusparseStatus_t             stat;
849   cusparseIndexBase_t          indexBase;
850   cudaError_t                  err;
851   PetscErrorCode               ierr;
852 
853   PetscFunctionBegin;
854 
855   /* allocate space for the triangular factor information */
856   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
857   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
858   indexBase = cusparseGetMatIndexBase(matstruct->descr);
859   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
860   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
861 
862   /* set alpha and beta */
863   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
864   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
865   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
866   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
867   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
868   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
869   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
870 
871   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
872     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
873     CsrMatrix *matrixT= new CsrMatrix;
874     matrixT->num_rows = A->cmap->n;
875     matrixT->num_cols = A->rmap->n;
876     matrixT->num_entries = a->nz;
877     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
878     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
879     matrixT->values = new THRUSTARRAY(a->nz);
880 
881     /* compute the transpose of the upper triangular factor, i.e. the CSC */
882     indexBase = cusparseGetMatIndexBase(matstruct->descr);
883     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
884                             matrix->num_cols, matrix->num_entries,
885                             matrix->values->data().get(),
886                             matrix->row_offsets->data().get(),
887                             matrix->column_indices->data().get(),
888                             matrixT->values->data().get(),
889                             matrixT->column_indices->data().get(),
890                             matrixT->row_offsets->data().get(),
891                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
892 
893     /* assign the pointer */
894     matstructT->mat = matrixT;
895     ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
896   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
897 #if CUDA_VERSION>=5000
898     /* First convert HYB to CSR */
899     CsrMatrix *temp= new CsrMatrix;
900     temp->num_rows = A->rmap->n;
901     temp->num_cols = A->cmap->n;
902     temp->num_entries = a->nz;
903     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
904     temp->column_indices = new THRUSTINTARRAY32(a->nz);
905     temp->values = new THRUSTARRAY(a->nz);
906 
907 
908     stat = cusparse_hyb2csr(cusparsestruct->handle,
909                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
910                             temp->values->data().get(),
911                             temp->row_offsets->data().get(),
912                             temp->column_indices->data().get());CHKERRCUDA(stat);
913 
914     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
915     CsrMatrix *tempT= new CsrMatrix;
916     tempT->num_rows = A->rmap->n;
917     tempT->num_cols = A->cmap->n;
918     tempT->num_entries = a->nz;
919     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
920     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
921     tempT->values = new THRUSTARRAY(a->nz);
922 
923     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
924                             temp->num_cols, temp->num_entries,
925                             temp->values->data().get(),
926                             temp->row_offsets->data().get(),
927                             temp->column_indices->data().get(),
928                             tempT->values->data().get(),
929                             tempT->column_indices->data().get(),
930                             tempT->row_offsets->data().get(),
931                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
932 
933     /* Last, convert CSC to HYB */
934     cusparseHybMat_t hybMat;
935     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
936     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
937       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
938     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
939                             matstructT->descr, tempT->values->data().get(),
940                             tempT->row_offsets->data().get(),
941                             tempT->column_indices->data().get(),
942                             hybMat, 0, partition);CHKERRCUDA(stat);
943 
944     /* assign the pointer */
945     matstructT->mat = hybMat;
946     ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);
947 
948     /* delete temporaries */
949     if (tempT) {
950       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
951       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
952       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
953       delete (CsrMatrix*) tempT;
954     }
955     if (temp) {
956       if (temp->values) delete (THRUSTARRAY*) temp->values;
957       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
958       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
959       delete (CsrMatrix*) temp;
960     }
961 #else
962     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
963 #endif
964   }
965   /* assign the compressed row indices */
966   matstructT->cprowIndices = new THRUSTINTARRAY;
967   matstructT->cprowIndices->resize(A->cmap->n);
968   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
969   /* assign the pointer */
970   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
971   PetscFunctionReturn(0);
972 }
973 
974 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
975 {
976   PetscInt                              n = xx->map->n;
977   const PetscScalar                     *barray;
978   PetscScalar                           *xarray;
979   thrust::device_ptr<const PetscScalar> bGPU;
980   thrust::device_ptr<PetscScalar>       xGPU;
981   cusparseStatus_t                      stat;
982   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
983   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
984   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
985   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
986   PetscErrorCode                        ierr;
987 
988   PetscFunctionBegin;
989   /* Analyze the matrix and create the transpose ... on the fly */
990   if (!loTriFactorT && !upTriFactorT) {
991     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
992     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
993     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
994   }
995 
996   /* Get the GPU pointers */
997   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
998   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
999   xGPU = thrust::device_pointer_cast(xarray);
1000   bGPU = thrust::device_pointer_cast(barray);
1001 
1002   /* First, reorder with the row permutation */
1003   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1004                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1005                xGPU);
1006 
1007   /* First, solve U */
1008   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1009                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1010                         upTriFactorT->csrMat->values->data().get(),
1011                         upTriFactorT->csrMat->row_offsets->data().get(),
1012                         upTriFactorT->csrMat->column_indices->data().get(),
1013                         upTriFactorT->solveInfo,
1014                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1015 
1016   /* Then, solve L */
1017   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1018                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1019                         loTriFactorT->csrMat->values->data().get(),
1020                         loTriFactorT->csrMat->row_offsets->data().get(),
1021                         loTriFactorT->csrMat->column_indices->data().get(),
1022                         loTriFactorT->solveInfo,
1023                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1024 
1025   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1026   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1027                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1028                tempGPU->begin());
1029 
1030   /* Copy the temporary to the full solution. */
1031   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1032 
1033   /* restore */
1034   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1035   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1036   ierr = WaitForGPU();CHKERRCUDA(ierr);
1037 
1038   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1039   PetscFunctionReturn(0);
1040 }
1041 
1042 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1043 {
1044   const PetscScalar                 *barray;
1045   PetscScalar                       *xarray;
1046   cusparseStatus_t                  stat;
1047   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1048   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1049   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1050   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1051   PetscErrorCode                    ierr;
1052 
1053   PetscFunctionBegin;
1054   /* Analyze the matrix and create the transpose ... on the fly */
1055   if (!loTriFactorT && !upTriFactorT) {
1056     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1057     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1058     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1059   }
1060 
1061   /* Get the GPU pointers */
1062   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1063   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1064 
1065   /* First, solve U */
1066   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1067                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1068                         upTriFactorT->csrMat->values->data().get(),
1069                         upTriFactorT->csrMat->row_offsets->data().get(),
1070                         upTriFactorT->csrMat->column_indices->data().get(),
1071                         upTriFactorT->solveInfo,
1072                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1073 
1074   /* Then, solve L */
1075   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1077                         loTriFactorT->csrMat->values->data().get(),
1078                         loTriFactorT->csrMat->row_offsets->data().get(),
1079                         loTriFactorT->csrMat->column_indices->data().get(),
1080                         loTriFactorT->solveInfo,
1081                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1082 
1083   /* restore */
1084   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1085   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1086   ierr = WaitForGPU();CHKERRCUDA(ierr);
1087   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1088   PetscFunctionReturn(0);
1089 }
1090 
1091 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1092 {
1093   const PetscScalar                     *barray;
1094   PetscScalar                           *xarray;
1095   thrust::device_ptr<const PetscScalar> bGPU;
1096   thrust::device_ptr<PetscScalar>       xGPU;
1097   cusparseStatus_t                      stat;
1098   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1099   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1100   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1101   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1102   PetscErrorCode                        ierr;
1103 
1104   PetscFunctionBegin;
1105 
1106   /* Get the GPU pointers */
1107   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1108   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1109   xGPU = thrust::device_pointer_cast(xarray);
1110   bGPU = thrust::device_pointer_cast(barray);
1111 
1112   /* First, reorder with the row permutation */
1113   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1114                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1115                xGPU);
1116 
1117   /* Next, solve L */
1118   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1119                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1120                         loTriFactor->csrMat->values->data().get(),
1121                         loTriFactor->csrMat->row_offsets->data().get(),
1122                         loTriFactor->csrMat->column_indices->data().get(),
1123                         loTriFactor->solveInfo,
1124                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1125 
1126   /* Then, solve U */
1127   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1128                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1129                         upTriFactor->csrMat->values->data().get(),
1130                         upTriFactor->csrMat->row_offsets->data().get(),
1131                         upTriFactor->csrMat->column_indices->data().get(),
1132                         upTriFactor->solveInfo,
1133                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1134 
1135   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1136   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1137                thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1138                tempGPU->begin());
1139 
1140   /* Copy the temporary to the full solution. */
1141   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1142 
1143   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1144   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1145   ierr = WaitForGPU();CHKERRCUDA(ierr);
1146   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1147   PetscFunctionReturn(0);
1148 }
1149 
1150 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1151 {
1152   const PetscScalar                 *barray;
1153   PetscScalar                       *xarray;
1154   cusparseStatus_t                  stat;
1155   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1156   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1157   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1158   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1159   PetscErrorCode                    ierr;
1160 
1161   PetscFunctionBegin;
1162   /* Get the GPU pointers */
1163   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1164   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1165 
1166   /* First, solve L */
1167   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1168                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1169                         loTriFactor->csrMat->values->data().get(),
1170                         loTriFactor->csrMat->row_offsets->data().get(),
1171                         loTriFactor->csrMat->column_indices->data().get(),
1172                         loTriFactor->solveInfo,
1173                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1174 
1175   /* Next, solve U */
1176   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1177                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1178                         upTriFactor->csrMat->values->data().get(),
1179                         upTriFactor->csrMat->row_offsets->data().get(),
1180                         upTriFactor->csrMat->column_indices->data().get(),
1181                         upTriFactor->solveInfo,
1182                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1183 
1184   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1185   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1186   ierr = WaitForGPU();CHKERRCUDA(ierr);
1187   ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1188   PetscFunctionReturn(0);
1189 }
1190 
1191 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1192 {
1193 
1194   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1195   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1196   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1197   PetscInt                     m = A->rmap->n,*ii,*ridx;
1198   PetscErrorCode               ierr;
1199   cusparseStatus_t             stat;
1200   cudaError_t                  err;
1201 
1202   PetscFunctionBegin;
1203   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1204     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1205     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1206       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1207       /* copy values only */
1208       matrix->values->assign(a->a, a->a+a->nz);
1209       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1210     } else {
1211       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1212       try {
1213         cusparsestruct->nonzerorow=0;
1214         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1215 
1216         if (a->compressedrow.use) {
1217           m    = a->compressedrow.nrows;
1218           ii   = a->compressedrow.i;
1219           ridx = a->compressedrow.rindex;
1220         } else {
1221           /* Forcing compressed row on the GPU */
1222           int k=0;
1223           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1224           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1225           ii[0]=0;
1226           for (int j = 0; j<m; j++) {
1227             if ((a->i[j+1]-a->i[j])>0) {
1228               ii[k]  = a->i[j];
1229               ridx[k]= j;
1230               k++;
1231             }
1232           }
1233           ii[cusparsestruct->nonzerorow] = a->nz;
1234           m = cusparsestruct->nonzerorow;
1235         }
1236 
1237         /* allocate space for the triangular factor information */
1238         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1239         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1240         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1241         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1242 
1243         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1244         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1245         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1246         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1247         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1248         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1249         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1250 
1251         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1252         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1253           /* set the matrix */
1254           CsrMatrix *matrix= new CsrMatrix;
1255           matrix->num_rows = m;
1256           matrix->num_cols = A->cmap->n;
1257           matrix->num_entries = a->nz;
1258           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1259           matrix->row_offsets->assign(ii, ii + m+1);
1260 
1261           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1262           matrix->column_indices->assign(a->j, a->j+a->nz);
1263 
1264           matrix->values = new THRUSTARRAY(a->nz);
1265           matrix->values->assign(a->a, a->a+a->nz);
1266 
1267           /* assign the pointer */
1268           matstruct->mat = matrix;
1269 
1270         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1271 #if CUDA_VERSION>=4020
1272           CsrMatrix *matrix= new CsrMatrix;
1273           matrix->num_rows = m;
1274           matrix->num_cols = A->cmap->n;
1275           matrix->num_entries = a->nz;
1276           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1277           matrix->row_offsets->assign(ii, ii + m+1);
1278 
1279           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1280           matrix->column_indices->assign(a->j, a->j+a->nz);
1281 
1282           matrix->values = new THRUSTARRAY(a->nz);
1283           matrix->values->assign(a->a, a->a+a->nz);
1284 
1285           cusparseHybMat_t hybMat;
1286           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1287           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1288             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1289           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1290               matstruct->descr, matrix->values->data().get(),
1291               matrix->row_offsets->data().get(),
1292               matrix->column_indices->data().get(),
1293               hybMat, 0, partition);CHKERRCUDA(stat);
1294           /* assign the pointer */
1295           matstruct->mat = hybMat;
1296 
1297           if (matrix) {
1298             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1299             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1300             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1301             delete (CsrMatrix*)matrix;
1302           }
1303 #endif
1304         }
1305 
1306         /* assign the compressed row indices */
1307         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1308         matstruct->cprowIndices->assign(ridx,ridx+m);
1309         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1310 
1311         /* assign the pointer */
1312         cusparsestruct->mat = matstruct;
1313 
1314         if (!a->compressedrow.use) {
1315           ierr = PetscFree(ii);CHKERRQ(ierr);
1316           ierr = PetscFree(ridx);CHKERRQ(ierr);
1317         }
1318         cusparsestruct->workVector = new THRUSTARRAY(m);
1319       } catch(char *ex) {
1320         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1321       }
1322       cusparsestruct->nonzerostate = A->nonzerostate;
1323     }
1324     ierr = WaitForGPU();CHKERRCUDA(ierr);
1325     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1326     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1327   }
1328   PetscFunctionReturn(0);
1329 }
1330 
1331 struct VecCUDAPlusEquals
1332 {
1333   template <typename Tuple>
1334   __host__ __device__
1335   void operator()(Tuple t)
1336   {
1337     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1338   }
1339 };
1340 
1341 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1342 {
1343   PetscErrorCode ierr;
1344 
1345   PetscFunctionBegin;
1346   ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1351 {
1352   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1353   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1354   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1355   const PetscScalar            *xarray;
1356   PetscScalar                  *yarray;
1357   PetscErrorCode               ierr;
1358   cusparseStatus_t             stat;
1359 
1360   PetscFunctionBegin;
1361   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1362   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1363   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1364   if (!matstructT) {
1365     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1366     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1367   }
1368   ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1369   ierr = VecSet(yy,0);CHKERRQ(ierr);
1370   ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1371 
1372   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1373     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1374     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1375                              mat->num_rows, mat->num_cols,
1376                              mat->num_entries, matstructT->alpha, matstructT->descr,
1377                              mat->values->data().get(), mat->row_offsets->data().get(),
1378                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1379                              yarray);CHKERRCUDA(stat);
1380   } else {
1381 #if CUDA_VERSION>=4020
1382     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1383     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1384                              matstructT->alpha, matstructT->descr, hybMat,
1385                              xarray, matstructT->beta_zero,
1386                              yarray);CHKERRCUDA(stat);
1387 #endif
1388   }
1389   ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1390   ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1391   if (!cusparsestruct->stream) {
1392     ierr = WaitForGPU();CHKERRCUDA(ierr);
1393   }
1394   ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1395   PetscFunctionReturn(0);
1396 }
1397 
1398 
1399 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1400 {
1401   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1402   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1403   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1404   const PetscScalar            *xarray;
1405   PetscScalar                  *zarray,*dptr,*beta;
1406   PetscErrorCode               ierr;
1407   cusparseStatus_t             stat;
1408 
1409   PetscFunctionBegin;
1410   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1411   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1412   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1413   try {
1414     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1415     ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1416     dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
1417     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
1418 
1419     /* csr_spmv is multiply add */
1420     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1421       /* here we need to be careful to set the number of rows in the multiply to the
1422          number of compressed rows in the matrix ... which is equivalent to the
1423          size of the workVector */
1424       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1425       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1426                                mat->num_rows, mat->num_cols,
1427                                mat->num_entries, matstruct->alpha, matstruct->descr,
1428                                mat->values->data().get(), mat->row_offsets->data().get(),
1429                                mat->column_indices->data().get(), xarray, beta,
1430                                dptr);CHKERRCUDA(stat);
1431     } else {
1432 #if CUDA_VERSION>=4020
1433       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1434       if (cusparsestruct->workVector->size()) {
1435         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1436                                  matstruct->alpha, matstruct->descr, hybMat,
1437                                  xarray, beta,
1438                                  dptr);CHKERRCUDA(stat);
1439       }
1440 #endif
1441     }
1442 
1443     if (yy) {
1444       if (dptr != zarray) {
1445         ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1446       } else if (zz != yy) {
1447         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
1448       }
1449     } else if (dptr != zarray) {
1450       ierr = VecSet(zz,0);CHKERRQ(ierr);
1451     }
1452     /* scatter the data from the temporary into the full vector with a += operation */
1453     if (dptr != zarray) {
1454       thrust::device_ptr<PetscScalar> zptr;
1455 
1456       zptr = thrust::device_pointer_cast(zarray);
1457       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1458                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1459                        VecCUDAPlusEquals());
1460     }
1461     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1462     ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1463   } catch(char *ex) {
1464     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1465   }
1466   if (!yy) { /* MatMult */
1467     if (!cusparsestruct->stream) {
1468       ierr = WaitForGPU();CHKERRCUDA(ierr);
1469     }
1470   }
1471   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1472   PetscFunctionReturn(0);
1473 }
1474 
1475 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1476 {
1477   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1478   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1479   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1480   thrust::device_ptr<PetscScalar> zptr;
1481   const PetscScalar               *xarray;
1482   PetscScalar                     *zarray;
1483   PetscErrorCode                  ierr;
1484   cusparseStatus_t                stat;
1485 
1486   PetscFunctionBegin;
1487   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1488   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1489   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1490   if (!matstructT) {
1491     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1492     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1493   }
1494 
1495   try {
1496     ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1497     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1498     ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1499     zptr = thrust::device_pointer_cast(zarray);
1500 
1501     /* multiply add with matrix transpose */
1502     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1503       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1504       /* here we need to be careful to set the number of rows in the multiply to the
1505          number of compressed rows in the matrix ... which is equivalent to the
1506          size of the workVector */
1507       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1508                                mat->num_rows, mat->num_cols,
1509                                mat->num_entries, matstructT->alpha, matstructT->descr,
1510                                mat->values->data().get(), mat->row_offsets->data().get(),
1511                                mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1512                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1513     } else {
1514 #if CUDA_VERSION>=4020
1515       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1516       if (cusparsestruct->workVector->size()) {
1517         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1518             matstructT->alpha, matstructT->descr, hybMat,
1519             xarray, matstructT->beta_zero,
1520             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1521       }
1522 #endif
1523     }
1524 
1525     /* scatter the data from the temporary into the full vector with a += operation */
1526     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1527         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1528         VecCUDAPlusEquals());
1529 
1530     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1531     ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1532 
1533   } catch(char *ex) {
1534     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1535   }
1536   ierr = WaitForGPU();CHKERRCUDA(ierr);
1537   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1538   PetscFunctionReturn(0);
1539 }
1540 
1541 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1542 {
1543   PetscErrorCode ierr;
1544 
1545   PetscFunctionBegin;
1546   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1547   if (A->factortype==MAT_FACTOR_NONE) {
1548     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1549   }
1550   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1551   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1552   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1553   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1554   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1555   PetscFunctionReturn(0);
1556 }
1557 
1558 /* --------------------------------------------------------------------------------*/
1559 /*@
1560    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1561    (the default parallel PETSc format). This matrix will ultimately pushed down
1562    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1563    assembly performance the user should preallocate the matrix storage by setting
1564    the parameter nz (or the array nnz).  By setting these parameters accurately,
1565    performance during matrix assembly can be increased by more than a factor of 50.
1566 
1567    Collective
1568 
1569    Input Parameters:
1570 +  comm - MPI communicator, set to PETSC_COMM_SELF
1571 .  m - number of rows
1572 .  n - number of columns
1573 .  nz - number of nonzeros per row (same for all rows)
1574 -  nnz - array containing the number of nonzeros in the various rows
1575          (possibly different for each row) or NULL
1576 
1577    Output Parameter:
1578 .  A - the matrix
1579 
1580    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1581    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1582    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1583 
1584    Notes:
1585    If nnz is given then nz is ignored
1586 
1587    The AIJ format (also called the Yale sparse matrix format or
1588    compressed row storage), is fully compatible with standard Fortran 77
1589    storage.  That is, the stored row and column indices can begin at
1590    either one (as in Fortran) or zero.  See the users' manual for details.
1591 
1592    Specify the preallocated storage with either nz or nnz (not both).
1593    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1594    allocation.  For large problems you MUST preallocate memory or you
1595    will get TERRIBLE performance, see the users' manual chapter on matrices.
1596 
1597    By default, this format uses inodes (identical nodes) when possible, to
1598    improve numerical efficiency of matrix-vector products and solves. We
1599    search for consecutive rows with the same nonzero structure, thereby
1600    reusing matrix information to achieve increased efficiency.
1601 
1602    Level: intermediate
1603 
1604 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1605 @*/
1606 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1607 {
1608   PetscErrorCode ierr;
1609 
1610   PetscFunctionBegin;
1611   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1612   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1613   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1614   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1615   PetscFunctionReturn(0);
1616 }
1617 
1618 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1619 {
1620   PetscErrorCode   ierr;
1621 
1622   PetscFunctionBegin;
1623   if (A->factortype==MAT_FACTOR_NONE) {
1624     if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1625       ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
1626     }
1627   } else {
1628     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1629   }
1630   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1631   PetscFunctionReturn(0);
1632 }
1633 
1634 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1635 {
1636   PetscErrorCode ierr;
1637   Mat C;
1638   cusparseStatus_t stat;
1639   cusparseHandle_t handle=0;
1640 
1641   PetscFunctionBegin;
1642   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
1643   C    = *B;
1644   ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
1645   ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);
1646 
1647   /* inject CUSPARSE-specific stuff */
1648   if (C->factortype==MAT_FACTOR_NONE) {
1649     /* you cannot check the inode.use flag here since the matrix was just created.
1650        now build a GPU matrix data structure */
1651     C->spptr = new Mat_SeqAIJCUSPARSE;
1652     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
1653     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1654     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
1655     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
1656     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1657     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = 0;
1658     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1659     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
1660     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1661     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1662   } else {
1663     /* NEXT, set the pointers to the triangular factors */
1664     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1665     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1666     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1667     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1668     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1669     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1670     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1671     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1672     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1673     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1674     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1675     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1676   }
1677 
1678   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1679   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1680   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1681   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1682   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1683   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1684   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1685   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1686 
1687   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1688 
1689   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1690 
1691   ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1692   PetscFunctionReturn(0);
1693 }
1694 
1695 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1696 {
1697   PetscErrorCode ierr;
1698   cusparseStatus_t stat;
1699   cusparseHandle_t handle=0;
1700 
1701   PetscFunctionBegin;
1702   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1703   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
1704 
1705   if (B->factortype==MAT_FACTOR_NONE) {
1706     /* you cannot check the inode.use flag here since the matrix was just created.
1707        now build a GPU matrix data structure */
1708     B->spptr = new Mat_SeqAIJCUSPARSE;
1709     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1710     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1711     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1712     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1713     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1714     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = 0;
1715     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1716     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1717     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1718     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1719   } else {
1720     /* NEXT, set the pointers to the triangular factors */
1721     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1722     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1723     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1724     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1725     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1726     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1727     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1728     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1729     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = 0;
1730     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1731     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1732     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1733   }
1734 
1735   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1736   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1737   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1738   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1739   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1740   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1741   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1742   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1743 
1744   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1745 
1746   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1747 
1748   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1749   PetscFunctionReturn(0);
1750 }
1751 
1752 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1753 {
1754   PetscErrorCode ierr;
1755 
1756   PetscFunctionBegin;
1757   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
1758   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1759   PetscFunctionReturn(0);
1760 }
1761 
1762 /*MC
1763    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1764 
1765    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1766    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1767    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1768 
1769    Options Database Keys:
1770 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1771 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1772 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1773 
1774   Level: beginner
1775 
1776 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1777 M*/
1778 
1779 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1780 
1781 
1782 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1783 {
1784   PetscErrorCode ierr;
1785 
1786   PetscFunctionBegin;
1787   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1788   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1789   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1790   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1791   PetscFunctionReturn(0);
1792 }
1793 
1794 
1795 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1796 {
1797   cusparseStatus_t stat;
1798   cusparseHandle_t handle;
1799 
1800   PetscFunctionBegin;
1801   if (*cusparsestruct) {
1802     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1803     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1804     delete (*cusparsestruct)->workVector;
1805     if (handle = (*cusparsestruct)->handle) {
1806       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1807     }
1808     delete *cusparsestruct;
1809     *cusparsestruct = 0;
1810   }
1811   PetscFunctionReturn(0);
1812 }
1813 
1814 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1815 {
1816   PetscFunctionBegin;
1817   if (*mat) {
1818     delete (*mat)->values;
1819     delete (*mat)->column_indices;
1820     delete (*mat)->row_offsets;
1821     delete *mat;
1822     *mat = 0;
1823   }
1824   PetscFunctionReturn(0);
1825 }
1826 
1827 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1828 {
1829   cusparseStatus_t stat;
1830   PetscErrorCode   ierr;
1831 
1832   PetscFunctionBegin;
1833   if (*trifactor) {
1834     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1835     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1836     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
1837     delete *trifactor;
1838     *trifactor = 0;
1839   }
1840   PetscFunctionReturn(0);
1841 }
1842 
1843 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1844 {
1845   CsrMatrix        *mat;
1846   cusparseStatus_t stat;
1847   cudaError_t      err;
1848 
1849   PetscFunctionBegin;
1850   if (*matstruct) {
1851     if ((*matstruct)->mat) {
1852       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1853         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1854         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1855       } else {
1856         mat = (CsrMatrix*)(*matstruct)->mat;
1857         CsrMatrix_Destroy(&mat);
1858       }
1859     }
1860     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1861     delete (*matstruct)->cprowIndices;
1862     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1863     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1864     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1865     delete *matstruct;
1866     *matstruct = 0;
1867   }
1868   PetscFunctionReturn(0);
1869 }
1870 
1871 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1872 {
1873   cusparseHandle_t handle;
1874   cusparseStatus_t stat;
1875 
1876   PetscFunctionBegin;
1877   if (*trifactors) {
1878     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1879     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1880     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1881     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1882     delete (*trifactors)->rpermIndices;
1883     delete (*trifactors)->cpermIndices;
1884     delete (*trifactors)->workVector;
1885     if (handle = (*trifactors)->handle) {
1886       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1887     }
1888     delete *trifactors;
1889     *trifactors = 0;
1890   }
1891   PetscFunctionReturn(0);
1892 }
1893 
1894