xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4211eb483909a9a6403702befc20456a00a91081)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
15 const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
16 
17 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
18 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
19 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
20 
21 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
22 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
23 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
24 
25 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
26 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
27 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
28 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
29 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
30 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
31 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
32 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
33 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
34 
35 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
36 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
37 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
38 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
39 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
41 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
42 {
43   cusparseStatus_t   stat;
44   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
45 
46   PetscFunctionBegin;
47   cusparsestruct->stream = stream;
48   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
49   PetscFunctionReturn(0);
50 }
51 
52 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
53 {
54   cusparseStatus_t   stat;
55   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
56 
57   PetscFunctionBegin;
58   if (cusparsestruct->handle != handle) {
59     if (cusparsestruct->handle) {
60       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
61     }
62     cusparsestruct->handle = handle;
63   }
64   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
65   PetscFunctionReturn(0);
66 }
67 
68 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
69 {
70   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
71   PetscFunctionBegin;
72   if (cusparsestruct->handle)
73     cusparsestruct->handle = 0;
74   PetscFunctionReturn(0);
75 }
76 
77 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
78 {
79   PetscFunctionBegin;
80   *type = MATSOLVERCUSPARSE;
81   PetscFunctionReturn(0);
82 }
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
86   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
87   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
88   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
97 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
98 {
99   PetscErrorCode ierr;
100   PetscInt       n = A->rmap->n;
101 
102   PetscFunctionBegin;
103   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
104   (*B)->factortype = ftype;
105   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
106   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
107 
108   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
109     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
110     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
111     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
112   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
113     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
114     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
115   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
116 
117   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
118   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
123 {
124   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
125 
126   PetscFunctionBegin;
127 #if CUDA_VERSION>=4020
128   switch (op) {
129   case MAT_CUSPARSE_MULT:
130     cusparsestruct->format = format;
131     break;
132   case MAT_CUSPARSE_ALL:
133     cusparsestruct->format = format;
134     break;
135   default:
136     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
137   }
138 #else
139   if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
140 #endif
141   PetscFunctionReturn(0);
142 }
143 
144 /*@
145    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146    operation. Only the MatMult operation can use different GPU storage formats
147    for MPIAIJCUSPARSE matrices.
148    Not Collective
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
154 
155    Output Parameter:
156 
157    Level: intermediate
158 
159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160 @*/
161 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
162 {
163   PetscErrorCode ierr;
164 
165   PetscFunctionBegin;
166   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
167   ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
168   PetscFunctionReturn(0);
169 }
170 
171 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
172 {
173   PetscErrorCode           ierr;
174   MatCUSPARSEStorageFormat format;
175   PetscBool                flg;
176   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
177 
178   PetscFunctionBegin;
179   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
180   if (A->factortype==MAT_FACTOR_NONE) {
181     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
182                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
183     if (flg) {
184       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
185     }
186   }
187   ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
188                           "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
189   if (flg) {
190     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
191   }
192   ierr = PetscOptionsTail();CHKERRQ(ierr);
193   PetscFunctionReturn(0);
194 
195 }
196 
197 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
198 {
199   PetscErrorCode ierr;
200 
201   PetscFunctionBegin;
202   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
203   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
204   PetscFunctionReturn(0);
205 }
206 
207 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
208 {
209   PetscErrorCode ierr;
210 
211   PetscFunctionBegin;
212   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
213   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
214   PetscFunctionReturn(0);
215 }
216 
217 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
218 {
219   PetscErrorCode ierr;
220 
221   PetscFunctionBegin;
222   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
223   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
224   PetscFunctionReturn(0);
225 }
226 
227 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
228 {
229   PetscErrorCode ierr;
230 
231   PetscFunctionBegin;
232   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
233   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
234   PetscFunctionReturn(0);
235 }
236 
237 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
238 {
239   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
240   PetscInt                          n = A->rmap->n;
241   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
242   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
243   cusparseStatus_t                  stat;
244   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
245   const MatScalar                   *aa = a->a,*v;
246   PetscInt                          *AiLo, *AjLo;
247   PetscScalar                       *AALo;
248   PetscInt                          i,nz, nzLower, offset, rowOffset;
249   PetscErrorCode                    ierr;
250 
251   PetscFunctionBegin;
252   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
253     try {
254       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
255       nzLower=n+ai[n]-ai[1];
256 
257       /* Allocate Space for the lower triangular matrix */
258       ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
259       ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
260       ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);
261 
262       /* Fill the lower triangular matrix */
263       AiLo[0]  = (PetscInt) 0;
264       AiLo[n]  = nzLower;
265       AjLo[0]  = (PetscInt) 0;
266       AALo[0]  = (MatScalar) 1.0;
267       v        = aa;
268       vi       = aj;
269       offset   = 1;
270       rowOffset= 1;
271       for (i=1; i<n; i++) {
272         nz = ai[i+1] - ai[i];
273         /* additional 1 for the term on the diagonal */
274         AiLo[i]    = rowOffset;
275         rowOffset += nz+1;
276 
277         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
278         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
279 
280         offset      += nz;
281         AjLo[offset] = (PetscInt) i;
282         AALo[offset] = (MatScalar) 1.0;
283         offset      += 1;
284 
285         v  += nz;
286         vi += nz;
287       }
288 
289       /* allocate space for the triangular factor information */
290       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
291 
292       /* Create the matrix description */
293       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
294       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
295       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
296       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
297       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
298 
299       /* Create the solve analysis information */
300       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
301 
302       /* set the operation */
303       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
304 
305       /* set the matrix */
306       loTriFactor->csrMat = new CsrMatrix;
307       loTriFactor->csrMat->num_rows = n;
308       loTriFactor->csrMat->num_cols = n;
309       loTriFactor->csrMat->num_entries = nzLower;
310 
311       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
312       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
313 
314       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
315       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
316 
317       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
318       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
319 
320       /* perform the solve analysis */
321       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
322                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
323                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
324                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
325 
326       /* assign the pointer. Is this really necessary? */
327       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
328 
329       ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
330       ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
331       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
332       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
333     } catch(char *ex) {
334       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
335     }
336   }
337   PetscFunctionReturn(0);
338 }
339 
340 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
341 {
342   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
343   PetscInt                          n = A->rmap->n;
344   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
345   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
346   cusparseStatus_t                  stat;
347   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
348   const MatScalar                   *aa = a->a,*v;
349   PetscInt                          *AiUp, *AjUp;
350   PetscScalar                       *AAUp;
351   PetscInt                          i,nz, nzUpper, offset;
352   PetscErrorCode                    ierr;
353 
354   PetscFunctionBegin;
355   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
356     try {
357       /* next, figure out the number of nonzeros in the upper triangular matrix. */
358       nzUpper = adiag[0]-adiag[n];
359 
360       /* Allocate Space for the upper triangular matrix */
361       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
362       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
363       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
364 
365       /* Fill the upper triangular matrix */
366       AiUp[0]=(PetscInt) 0;
367       AiUp[n]=nzUpper;
368       offset = nzUpper;
369       for (i=n-1; i>=0; i--) {
370         v  = aa + adiag[i+1] + 1;
371         vi = aj + adiag[i+1] + 1;
372 
373         /* number of elements NOT on the diagonal */
374         nz = adiag[i] - adiag[i+1]-1;
375 
376         /* decrement the offset */
377         offset -= (nz+1);
378 
379         /* first, set the diagonal elements */
380         AjUp[offset] = (PetscInt) i;
381         AAUp[offset] = (MatScalar)1./v[nz];
382         AiUp[i]      = AiUp[i+1] - (nz+1);
383 
384         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
385         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
386       }
387 
388       /* allocate space for the triangular factor information */
389       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
390 
391       /* Create the matrix description */
392       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
393       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
394       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
395       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
396       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
397 
398       /* Create the solve analysis information */
399       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
400 
401       /* set the operation */
402       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
403 
404       /* set the matrix */
405       upTriFactor->csrMat = new CsrMatrix;
406       upTriFactor->csrMat->num_rows = n;
407       upTriFactor->csrMat->num_cols = n;
408       upTriFactor->csrMat->num_entries = nzUpper;
409 
410       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
411       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
412 
413       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
414       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
415 
416       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
417       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
418 
419       /* perform the solve analysis */
420       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
421                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
422                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
423                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
424 
425       /* assign the pointer. Is this really necessary? */
426       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
427 
428       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
429       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
430       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
431       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
432     } catch(char *ex) {
433       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
434     }
435   }
436   PetscFunctionReturn(0);
437 }
438 
439 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
440 {
441   PetscErrorCode               ierr;
442   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
443   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
444   IS                           isrow = a->row,iscol = a->icol;
445   PetscBool                    row_identity,col_identity;
446   const PetscInt               *r,*c;
447   PetscInt                     n = A->rmap->n;
448 
449   PetscFunctionBegin;
450   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
451   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
452 
453   cusparseTriFactors->workVector = new THRUSTARRAY(n);
454   cusparseTriFactors->nnz=a->nz;
455 
456   A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
457   /*lower triangular indices */
458   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
459   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
460   if (!row_identity) {
461     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
462     cusparseTriFactors->rpermIndices->assign(r, r+n);
463   }
464   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
465 
466   /*upper triangular indices */
467   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
468   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
469   if (!col_identity) {
470     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
471     cusparseTriFactors->cpermIndices->assign(c, c+n);
472   }
473 
474   if(!row_identity && !col_identity) {
475     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
476   } else if(!row_identity) {
477     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
478   } else if(!col_identity) {
479     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
480   }
481 
482   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
483   PetscFunctionReturn(0);
484 }
485 
486 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
487 {
488   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
489   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
491   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
492   cusparseStatus_t                  stat;
493   PetscErrorCode                    ierr;
494   PetscInt                          *AiUp, *AjUp;
495   PetscScalar                       *AAUp;
496   PetscScalar                       *AALo;
497   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
498   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
499   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
500   const MatScalar                   *aa = b->a,*v;
501 
502   PetscFunctionBegin;
503   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
504     try {
505       /* Allocate Space for the upper triangular matrix */
506       ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
507       ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
508       ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
509       ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
510 
511       /* Fill the upper triangular matrix */
512       AiUp[0]=(PetscInt) 0;
513       AiUp[n]=nzUpper;
514       offset = 0;
515       for (i=0; i<n; i++) {
516         /* set the pointers */
517         v  = aa + ai[i];
518         vj = aj + ai[i];
519         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
520 
521         /* first, set the diagonal elements */
522         AjUp[offset] = (PetscInt) i;
523         AAUp[offset] = (MatScalar)1.0/v[nz];
524         AiUp[i]      = offset;
525         AALo[offset] = (MatScalar)1.0/v[nz];
526 
527         offset+=1;
528         if (nz>0) {
529           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
530           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
531           for (j=offset; j<offset+nz; j++) {
532             AAUp[j] = -AAUp[j];
533             AALo[j] = AAUp[j]/v[nz];
534           }
535           offset+=nz;
536         }
537       }
538 
539       /* allocate space for the triangular factor information */
540       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
541 
542       /* Create the matrix description */
543       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
544       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
545       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
546       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
547       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);
548 
549       /* Create the solve analysis information */
550       stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);
551 
552       /* set the operation */
553       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
554 
555       /* set the matrix */
556       upTriFactor->csrMat = new CsrMatrix;
557       upTriFactor->csrMat->num_rows = A->rmap->n;
558       upTriFactor->csrMat->num_cols = A->cmap->n;
559       upTriFactor->csrMat->num_entries = a->nz;
560 
561       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
562       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
563 
564       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
565       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
566 
567       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
568       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
569 
570       /* perform the solve analysis */
571       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
572                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
573                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
574                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);
575 
576       /* assign the pointer. Is this really necessary? */
577       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
578 
579       /* allocate space for the triangular factor information */
580       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
581 
582       /* Create the matrix description */
583       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
584       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
585       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
586       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
587       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);
588 
589       /* Create the solve analysis information */
590       stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);
591 
592       /* set the operation */
593       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
594 
595       /* set the matrix */
596       loTriFactor->csrMat = new CsrMatrix;
597       loTriFactor->csrMat->num_rows = A->rmap->n;
598       loTriFactor->csrMat->num_cols = A->cmap->n;
599       loTriFactor->csrMat->num_entries = a->nz;
600 
601       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
602       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
603 
604       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
605       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
606 
607       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
608       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
609       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
610 
611       /* perform the solve analysis */
612       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
613                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
614                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
615                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);
616 
617       /* assign the pointer. Is this really necessary? */
618       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
619 
620       A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
621       ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
622       ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
623       ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
624       ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
625     } catch(char *ex) {
626       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
627     }
628   }
629   PetscFunctionReturn(0);
630 }
631 
632 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
633 {
634   PetscErrorCode               ierr;
635   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
636   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
637   IS                           ip = a->row;
638   const PetscInt               *rip;
639   PetscBool                    perm_identity;
640   PetscInt                     n = A->rmap->n;
641 
642   PetscFunctionBegin;
643   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
644   cusparseTriFactors->workVector = new THRUSTARRAY(n);
645   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
646 
647   /*lower triangular indices */
648   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
649   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
650   if (!perm_identity) {
651     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
652     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
653     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
654     cusparseTriFactors->cpermIndices->assign(rip, rip+n);
655     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
656   }
657   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
658   PetscFunctionReturn(0);
659 }
660 
661 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
662 {
663   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
664   IS             isrow = b->row,iscol = b->col;
665   PetscBool      row_identity,col_identity;
666   PetscErrorCode ierr;
667 
668   PetscFunctionBegin;
669   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
670   /* determine which version of MatSolve needs to be used. */
671   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
672   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
673   if (row_identity && col_identity) {
674     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
675     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
676   } else {
677     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
678     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
679   }
680 
681   /* get the triangular factors */
682   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
683   PetscFunctionReturn(0);
684 }
685 
686 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
687 {
688   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
689   IS             ip = b->row;
690   PetscBool      perm_identity;
691   PetscErrorCode ierr;
692 
693   PetscFunctionBegin;
694   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
695 
696   /* determine which version of MatSolve needs to be used. */
697   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
698   if (perm_identity) {
699     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
700     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
701   } else {
702     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
703     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
704   }
705 
706   /* get the triangular factors */
707   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
708   PetscFunctionReturn(0);
709 }
710 
711 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
712 {
713   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
714   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
715   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
716   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
717   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
718   cusparseStatus_t                  stat;
719   cusparseIndexBase_t               indexBase;
720   cusparseMatrixType_t              matrixType;
721   cusparseFillMode_t                fillMode;
722   cusparseDiagType_t                diagType;
723 
724   PetscFunctionBegin;
725 
726   /*********************************************/
727   /* Now the Transpose of the Lower Tri Factor */
728   /*********************************************/
729 
730   /* allocate space for the transpose of the lower triangular factor */
731   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
732 
733   /* set the matrix descriptors of the lower triangular factor */
734   matrixType = cusparseGetMatType(loTriFactor->descr);
735   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
736   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
737     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
738   diagType = cusparseGetMatDiagType(loTriFactor->descr);
739 
740   /* Create the matrix description */
741   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUDA(stat);
742   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUDA(stat);
743   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUDA(stat);
744   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUDA(stat);
745   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUDA(stat);
746 
747   /* Create the solve analysis information */
748   stat = cusparseCreateSolveAnalysisInfo(&loTriFactorT->solveInfo);CHKERRCUDA(stat);
749 
750   /* set the operation */
751   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
752 
753   /* allocate GPU space for the CSC of the lower triangular factor*/
754   loTriFactorT->csrMat = new CsrMatrix;
755   loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_rows;
756   loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_cols;
757   loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries;
758   loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactor->csrMat->num_rows+1);
759   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactor->csrMat->num_entries);
760   loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactor->csrMat->num_entries);
761 
762   /* compute the transpose of the lower triangular factor, i.e. the CSC */
763   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
764                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
765                           loTriFactor->csrMat->values->data().get(),
766                           loTriFactor->csrMat->row_offsets->data().get(),
767                           loTriFactor->csrMat->column_indices->data().get(),
768                           loTriFactorT->csrMat->values->data().get(),
769                           loTriFactorT->csrMat->column_indices->data().get(),
770                           loTriFactorT->csrMat->row_offsets->data().get(),
771                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
772 
773   /* perform the solve analysis on the transposed matrix */
774   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
775                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries,
776                            loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
777                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
778                            loTriFactorT->solveInfo);CHKERRCUDA(stat);
779 
780   /* assign the pointer. Is this really necessary? */
781   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
782 
783   /*********************************************/
784   /* Now the Transpose of the Upper Tri Factor */
785   /*********************************************/
786 
787   /* allocate space for the transpose of the upper triangular factor */
788   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
789 
790   /* set the matrix descriptors of the upper triangular factor */
791   matrixType = cusparseGetMatType(upTriFactor->descr);
792   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
793   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
794     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
795   diagType = cusparseGetMatDiagType(upTriFactor->descr);
796 
797   /* Create the matrix description */
798   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUDA(stat);
799   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUDA(stat);
800   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUDA(stat);
801   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUDA(stat);
802   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUDA(stat);
803 
804   /* Create the solve analysis information */
805   stat = cusparseCreateSolveAnalysisInfo(&upTriFactorT->solveInfo);CHKERRCUDA(stat);
806 
807   /* set the operation */
808   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
809 
810   /* allocate GPU space for the CSC of the upper triangular factor*/
811   upTriFactorT->csrMat = new CsrMatrix;
812   upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_rows;
813   upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_cols;
814   upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries;
815   upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactor->csrMat->num_rows+1);
816   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactor->csrMat->num_entries);
817   upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactor->csrMat->num_entries);
818 
819   /* compute the transpose of the upper triangular factor, i.e. the CSC */
820   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
821                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
822                           upTriFactor->csrMat->values->data().get(),
823                           upTriFactor->csrMat->row_offsets->data().get(),
824                           upTriFactor->csrMat->column_indices->data().get(),
825                           upTriFactorT->csrMat->values->data().get(),
826                           upTriFactorT->csrMat->column_indices->data().get(),
827                           upTriFactorT->csrMat->row_offsets->data().get(),
828                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
829 
830   /* perform the solve analysis on the transposed matrix */
831   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
832                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries,
833                            upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
834                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
835                            upTriFactorT->solveInfo);CHKERRCUDA(stat);
836 
837   /* assign the pointer. Is this really necessary? */
838   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
839   PetscFunctionReturn(0);
840 }
841 
842 static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
843 {
844   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
845   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
846   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
847   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
848   cusparseStatus_t             stat;
849   cusparseIndexBase_t          indexBase;
850   cudaError_t                  err;
851   PetscErrorCode               ierr;
852 
853   PetscFunctionBegin;
854 
855   /* allocate space for the triangular factor information */
856   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
857   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
858   indexBase = cusparseGetMatIndexBase(matstruct->descr);
859   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
860   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
861 
862   /* set alpha and beta */
863   err = cudaMalloc((void **)&(matstructT->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
864   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
865   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
866   err = cudaMemcpy(matstructT->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
867   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
868   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
869   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
870 
871   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
872     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
873     CsrMatrix *matrixT= new CsrMatrix;
874     matrixT->num_rows = A->cmap->n;
875     matrixT->num_cols = A->rmap->n;
876     matrixT->num_entries = a->nz;
877     matrixT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
878     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
879     matrixT->values = new THRUSTARRAY(a->nz);
880 
881     /* compute the transpose of the upper triangular factor, i.e. the CSC */
882     indexBase = cusparseGetMatIndexBase(matstruct->descr);
883     stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
884                             matrix->num_cols, matrix->num_entries,
885                             matrix->values->data().get(),
886                             matrix->row_offsets->data().get(),
887                             matrix->column_indices->data().get(),
888                             matrixT->values->data().get(),
889                             matrixT->column_indices->data().get(),
890                             matrixT->row_offsets->data().get(),
891                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
892 
893     /* assign the pointer */
894     matstructT->mat = matrixT;
895     ierr = PetscLogCpuToGpu(((A->rmap->n+1)+(a->nz))*sizeof(int)+(3+a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
896   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
897 #if CUDA_VERSION>=5000
898     /* First convert HYB to CSR */
899     CsrMatrix *temp= new CsrMatrix;
900     temp->num_rows = A->rmap->n;
901     temp->num_cols = A->cmap->n;
902     temp->num_entries = a->nz;
903     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
904     temp->column_indices = new THRUSTINTARRAY32(a->nz);
905     temp->values = new THRUSTARRAY(a->nz);
906 
907 
908     stat = cusparse_hyb2csr(cusparsestruct->handle,
909                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
910                             temp->values->data().get(),
911                             temp->row_offsets->data().get(),
912                             temp->column_indices->data().get());CHKERRCUDA(stat);
913 
914     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
915     CsrMatrix *tempT= new CsrMatrix;
916     tempT->num_rows = A->rmap->n;
917     tempT->num_cols = A->cmap->n;
918     tempT->num_entries = a->nz;
919     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
920     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
921     tempT->values = new THRUSTARRAY(a->nz);
922 
923     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
924                             temp->num_cols, temp->num_entries,
925                             temp->values->data().get(),
926                             temp->row_offsets->data().get(),
927                             temp->column_indices->data().get(),
928                             tempT->values->data().get(),
929                             tempT->column_indices->data().get(),
930                             tempT->row_offsets->data().get(),
931                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);
932 
933     /* Last, convert CSC to HYB */
934     cusparseHybMat_t hybMat;
935     stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
936     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
937       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
938     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
939                             matstructT->descr, tempT->values->data().get(),
940                             tempT->row_offsets->data().get(),
941                             tempT->column_indices->data().get(),
942                             hybMat, 0, partition);CHKERRCUDA(stat);
943 
944     /* assign the pointer */
945     matstructT->mat = hybMat;
946     ierr = PetscLogCpuToGpu((2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))+3*sizeof(PetscScalar));CHKERRQ(ierr);
947 
948     /* delete temporaries */
949     if (tempT) {
950       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
951       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
952       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
953       delete (CsrMatrix*) tempT;
954     }
955     if (temp) {
956       if (temp->values) delete (THRUSTARRAY*) temp->values;
957       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
958       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
959       delete (CsrMatrix*) temp;
960     }
961 #else
962     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
963 #endif
964   }
965   /* assign the compressed row indices */
966   matstructT->cprowIndices = new THRUSTINTARRAY;
967   matstructT->cprowIndices->resize(A->cmap->n);
968   thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());
969   /* assign the pointer */
970   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
971   PetscFunctionReturn(0);
972 }
973 
974 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
975 {
976   PetscInt                              n = xx->map->n;
977   const PetscScalar                     *barray;
978   PetscScalar                           *xarray;
979   thrust::device_ptr<const PetscScalar> bGPU;
980   thrust::device_ptr<PetscScalar>       xGPU;
981   cusparseStatus_t                      stat;
982   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
983   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
984   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
985   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
986   PetscErrorCode                        ierr;
987 
988   PetscFunctionBegin;
989   /* Analyze the matrix and create the transpose ... on the fly */
990   if (!loTriFactorT && !upTriFactorT) {
991     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
992     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
993     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
994   }
995 
996   /* Get the GPU pointers */
997   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
998   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
999   xGPU = thrust::device_pointer_cast(xarray);
1000   bGPU = thrust::device_pointer_cast(barray);
1001 
1002   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1003   /* First, reorder with the row permutation */
1004   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1005                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1006                xGPU);
1007 
1008   /* First, solve U */
1009   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1010                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1011                         upTriFactorT->csrMat->values->data().get(),
1012                         upTriFactorT->csrMat->row_offsets->data().get(),
1013                         upTriFactorT->csrMat->column_indices->data().get(),
1014                         upTriFactorT->solveInfo,
1015                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1016 
1017   /* Then, solve L */
1018   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1019                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1020                         loTriFactorT->csrMat->values->data().get(),
1021                         loTriFactorT->csrMat->row_offsets->data().get(),
1022                         loTriFactorT->csrMat->column_indices->data().get(),
1023                         loTriFactorT->solveInfo,
1024                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1025   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1026 
1027   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1028   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1029                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1030                tempGPU->begin());
1031 
1032   /* Copy the temporary to the full solution. */
1033   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1034 
1035   /* restore */
1036   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1037   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1038   ierr = WaitForGPU();CHKERRCUDA(ierr);
1039   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1040   PetscFunctionReturn(0);
1041 }
1042 
1043 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1044 {
1045   const PetscScalar                 *barray;
1046   PetscScalar                       *xarray;
1047   cusparseStatus_t                  stat;
1048   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1049   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1050   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1051   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1052   PetscErrorCode                    ierr;
1053 
1054   PetscFunctionBegin;
1055   /* Analyze the matrix and create the transpose ... on the fly */
1056   if (!loTriFactorT && !upTriFactorT) {
1057     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1058     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1059     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1060   }
1061 
1062   /* Get the GPU pointers */
1063   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1064   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1065 
1066   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1067   /* First, solve U */
1068   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1069                         upTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1070                         upTriFactorT->csrMat->values->data().get(),
1071                         upTriFactorT->csrMat->row_offsets->data().get(),
1072                         upTriFactorT->csrMat->column_indices->data().get(),
1073                         upTriFactorT->solveInfo,
1074                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1075 
1076   /* Then, solve L */
1077   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1078                         loTriFactorT->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1079                         loTriFactorT->csrMat->values->data().get(),
1080                         loTriFactorT->csrMat->row_offsets->data().get(),
1081                         loTriFactorT->csrMat->column_indices->data().get(),
1082                         loTriFactorT->solveInfo,
1083                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1084   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1085 
1086   /* restore */
1087   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1088   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1089   ierr = WaitForGPU();CHKERRCUDA(ierr);
1090   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1095 {
1096   const PetscScalar                     *barray;
1097   PetscScalar                           *xarray;
1098   thrust::device_ptr<const PetscScalar> bGPU;
1099   thrust::device_ptr<PetscScalar>       xGPU;
1100   cusparseStatus_t                      stat;
1101   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1102   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1103   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1104   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1105   PetscErrorCode                        ierr;
1106 
1107   PetscFunctionBegin;
1108 
1109   /* Get the GPU pointers */
1110   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1111   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1112   xGPU = thrust::device_pointer_cast(xarray);
1113   bGPU = thrust::device_pointer_cast(barray);
1114 
1115   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1116   /* First, reorder with the row permutation */
1117   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1118                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
1119                xGPU);
1120 
1121   /* Next, solve L */
1122   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1123                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1124                         loTriFactor->csrMat->values->data().get(),
1125                         loTriFactor->csrMat->row_offsets->data().get(),
1126                         loTriFactor->csrMat->column_indices->data().get(),
1127                         loTriFactor->solveInfo,
1128                         xarray, tempGPU->data().get());CHKERRCUDA(stat);
1129 
1130   /* Then, solve U */
1131   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1132                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1133                         upTriFactor->csrMat->values->data().get(),
1134                         upTriFactor->csrMat->row_offsets->data().get(),
1135                         upTriFactor->csrMat->column_indices->data().get(),
1136                         upTriFactor->solveInfo,
1137                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1138   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1139 
1140   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1141   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1142                thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
1143                tempGPU->begin());
1144 
1145   /* Copy the temporary to the full solution. */
1146   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1147 
1148   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1149   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1150   ierr = WaitForGPU();CHKERRCUDA(ierr);
1151   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1152   PetscFunctionReturn(0);
1153 }
1154 
1155 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1156 {
1157   const PetscScalar                 *barray;
1158   PetscScalar                       *xarray;
1159   cusparseStatus_t                  stat;
1160   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1161   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1162   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1163   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1164   PetscErrorCode                    ierr;
1165 
1166   PetscFunctionBegin;
1167   /* Get the GPU pointers */
1168   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1169   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1170 
1171   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1172   /* First, solve L */
1173   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1174                         loTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1175                         loTriFactor->csrMat->values->data().get(),
1176                         loTriFactor->csrMat->row_offsets->data().get(),
1177                         loTriFactor->csrMat->column_indices->data().get(),
1178                         loTriFactor->solveInfo,
1179                         barray, tempGPU->data().get());CHKERRCUDA(stat);
1180 
1181   /* Next, solve U */
1182   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1183                         upTriFactor->csrMat->num_rows, &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1184                         upTriFactor->csrMat->values->data().get(),
1185                         upTriFactor->csrMat->row_offsets->data().get(),
1186                         upTriFactor->csrMat->column_indices->data().get(),
1187                         upTriFactor->solveInfo,
1188                         tempGPU->data().get(), xarray);CHKERRCUDA(stat);
1189   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1190 
1191   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1192   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1193   ierr = WaitForGPU();CHKERRCUDA(ierr);
1194   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1195   PetscFunctionReturn(0);
1196 }
1197 
1198 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1199 {
1200 
1201   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1202   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1203   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1204   PetscInt                     m = A->rmap->n,*ii,*ridx;
1205   PetscErrorCode               ierr;
1206   cusparseStatus_t             stat;
1207   cudaError_t                  err;
1208 
1209   PetscFunctionBegin;
1210   if (A->pinnedtocpu) PetscFunctionReturn(0);
1211   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1212     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1213     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1214       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1215       /* copy values only */
1216       matrix->values->assign(a->a, a->a+a->nz);
1217       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1218     } else {
1219       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1220       try {
1221         cusparsestruct->nonzerorow=0;
1222         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1223 
1224         if (a->compressedrow.use) {
1225           m    = a->compressedrow.nrows;
1226           ii   = a->compressedrow.i;
1227           ridx = a->compressedrow.rindex;
1228         } else {
1229           /* Forcing compressed row on the GPU */
1230           int k=0;
1231           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1232           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1233           ii[0]=0;
1234           for (int j = 0; j<m; j++) {
1235             if ((a->i[j+1]-a->i[j])>0) {
1236               ii[k]  = a->i[j];
1237               ridx[k]= j;
1238               k++;
1239             }
1240           }
1241           ii[cusparsestruct->nonzerorow] = a->nz;
1242           m = cusparsestruct->nonzerorow;
1243         }
1244 
1245         /* allocate space for the triangular factor information */
1246         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1247         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1248         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1249         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1250 
1251         err = cudaMalloc((void **)&(matstruct->alpha),    sizeof(PetscScalar));CHKERRCUDA(err);
1252         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1253         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1254         err = cudaMemcpy(matstruct->alpha,    &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1255         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1256         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1257         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1258 
1259         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1260         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1261           /* set the matrix */
1262           CsrMatrix *matrix= new CsrMatrix;
1263           matrix->num_rows = m;
1264           matrix->num_cols = A->cmap->n;
1265           matrix->num_entries = a->nz;
1266           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1267           matrix->row_offsets->assign(ii, ii + m+1);
1268 
1269           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1270           matrix->column_indices->assign(a->j, a->j+a->nz);
1271 
1272           matrix->values = new THRUSTARRAY(a->nz);
1273           matrix->values->assign(a->a, a->a+a->nz);
1274 
1275           /* assign the pointer */
1276           matstruct->mat = matrix;
1277 
1278         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1279 #if CUDA_VERSION>=4020
1280           CsrMatrix *matrix= new CsrMatrix;
1281           matrix->num_rows = m;
1282           matrix->num_cols = A->cmap->n;
1283           matrix->num_entries = a->nz;
1284           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1285           matrix->row_offsets->assign(ii, ii + m+1);
1286 
1287           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1288           matrix->column_indices->assign(a->j, a->j+a->nz);
1289 
1290           matrix->values = new THRUSTARRAY(a->nz);
1291           matrix->values->assign(a->a, a->a+a->nz);
1292 
1293           cusparseHybMat_t hybMat;
1294           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1295           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1296             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1297           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1298               matstruct->descr, matrix->values->data().get(),
1299               matrix->row_offsets->data().get(),
1300               matrix->column_indices->data().get(),
1301               hybMat, 0, partition);CHKERRCUDA(stat);
1302           /* assign the pointer */
1303           matstruct->mat = hybMat;
1304 
1305           if (matrix) {
1306             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1307             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1308             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1309             delete (CsrMatrix*)matrix;
1310           }
1311 #endif
1312         }
1313 
1314         /* assign the compressed row indices */
1315         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1316         matstruct->cprowIndices->assign(ridx,ridx+m);
1317         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+m*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1318 
1319         /* assign the pointer */
1320         cusparsestruct->mat = matstruct;
1321 
1322         if (!a->compressedrow.use) {
1323           ierr = PetscFree(ii);CHKERRQ(ierr);
1324           ierr = PetscFree(ridx);CHKERRQ(ierr);
1325         }
1326         cusparsestruct->workVector = new THRUSTARRAY(m);
1327       } catch(char *ex) {
1328         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1329       }
1330       cusparsestruct->nonzerostate = A->nonzerostate;
1331     }
1332     ierr = WaitForGPU();CHKERRCUDA(ierr);
1333     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1334     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1335   }
1336   PetscFunctionReturn(0);
1337 }
1338 
1339 struct VecCUDAPlusEquals
1340 {
1341   template <typename Tuple>
1342   __host__ __device__
1343   void operator()(Tuple t)
1344   {
1345     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1346   }
1347 };
1348 
1349 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1350 {
1351   PetscErrorCode ierr;
1352 
1353   PetscFunctionBegin;
1354   ierr = MatMultAdd_SeqAIJCUSPARSE(A,xx,NULL,yy);CHKERRQ(ierr);
1355   PetscFunctionReturn(0);
1356 }
1357 
1358 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
1359 {
1360   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1361   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1362   Mat_SeqAIJCUSPARSEMultStruct *matstructT;
1363   const PetscScalar            *xarray;
1364   PetscScalar                  *yarray;
1365   PetscErrorCode               ierr;
1366   cusparseStatus_t             stat;
1367 
1368   PetscFunctionBegin;
1369   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1370   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1371   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1372   if (!matstructT) {
1373     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1374     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1375   }
1376   ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1377   ierr = VecSet(yy,0);CHKERRQ(ierr);
1378   ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
1379 
1380   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1381   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1382     CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1383     stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1384                              mat->num_rows, mat->num_cols,
1385                              mat->num_entries, matstructT->alpha, matstructT->descr,
1386                              mat->values->data().get(), mat->row_offsets->data().get(),
1387                              mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1388                              yarray);CHKERRCUDA(stat);
1389   } else {
1390 #if CUDA_VERSION>=4020
1391     cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1392     stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1393                              matstructT->alpha, matstructT->descr, hybMat,
1394                              xarray, matstructT->beta_zero,
1395                              yarray);CHKERRCUDA(stat);
1396 #endif
1397   }
1398   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1399   ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1400   ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
1401   if (!cusparsestruct->stream) {
1402     ierr = WaitForGPU();CHKERRCUDA(ierr);
1403   }
1404   ierr = PetscLogGpuFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
1405   PetscFunctionReturn(0);
1406 }
1407 
1408 
1409 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1410 {
1411   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1412   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1413   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
1414   const PetscScalar            *xarray;
1415   PetscScalar                  *zarray,*dptr,*beta;
1416   PetscErrorCode               ierr;
1417   cusparseStatus_t             stat;
1418 
1419   PetscFunctionBegin;
1420   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1421   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1422   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1423   try {
1424     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1425     ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1426     dptr = cusparsestruct->workVector->size() == (thrust::detail::vector_base<PetscScalar, thrust::device_malloc_allocator<PetscScalar> >::size_type)(A->rmap->n) ? zarray : cusparsestruct->workVector->data().get();
1427     beta = (yy == zz && dptr == zarray) ? matstruct->beta_one : matstruct->beta_zero;
1428 
1429     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1430     /* csr_spmv is multiply add */
1431     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1432       /* here we need to be careful to set the number of rows in the multiply to the
1433          number of compressed rows in the matrix ... which is equivalent to the
1434          size of the workVector */
1435       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
1436       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1437                                mat->num_rows, mat->num_cols,
1438                                mat->num_entries, matstruct->alpha, matstruct->descr,
1439                                mat->values->data().get(), mat->row_offsets->data().get(),
1440                                mat->column_indices->data().get(), xarray, beta,
1441                                dptr);CHKERRCUDA(stat);
1442     } else {
1443 #if CUDA_VERSION>=4020
1444       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
1445       if (cusparsestruct->workVector->size()) {
1446         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1447                                  matstruct->alpha, matstruct->descr, hybMat,
1448                                  xarray, beta,
1449                                  dptr);CHKERRCUDA(stat);
1450       }
1451 #endif
1452     }
1453     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1454 
1455     if (yy) {
1456       if (dptr != zarray) {
1457         ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1458       } else if (zz != yy) {
1459         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr);
1460       }
1461     } else if (dptr != zarray) {
1462       ierr = VecSet(zz,0);CHKERRQ(ierr);
1463     }
1464     /* scatter the data from the temporary into the full vector with a += operation */
1465     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1466     if (dptr != zarray) {
1467       thrust::device_ptr<PetscScalar> zptr;
1468 
1469       zptr = thrust::device_pointer_cast(zarray);
1470       thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
1471                        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
1472                        VecCUDAPlusEquals());
1473     }
1474     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1475     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1476     ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1477   } catch(char *ex) {
1478     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1479   }
1480   if (!yy) { /* MatMult */
1481     if (!cusparsestruct->stream) {
1482       ierr = WaitForGPU();CHKERRCUDA(ierr);
1483     }
1484   }
1485   ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1486   PetscFunctionReturn(0);
1487 }
1488 
1489 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
1490 {
1491   Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
1492   Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1493   Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
1494   thrust::device_ptr<PetscScalar> zptr;
1495   const PetscScalar               *xarray;
1496   PetscScalar                     *zarray;
1497   PetscErrorCode                  ierr;
1498   cusparseStatus_t                stat;
1499 
1500   PetscFunctionBegin;
1501   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
1502   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1503   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1504   if (!matstructT) {
1505     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1506     matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1507   }
1508 
1509   try {
1510     ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
1511     ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
1512     ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);
1513     zptr = thrust::device_pointer_cast(zarray);
1514 
1515     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1516     /* multiply add with matrix transpose */
1517     if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1518       CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
1519       /* here we need to be careful to set the number of rows in the multiply to the
1520          number of compressed rows in the matrix ... which is equivalent to the
1521          size of the workVector */
1522       stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1523                                mat->num_rows, mat->num_cols,
1524                                mat->num_entries, matstructT->alpha, matstructT->descr,
1525                                mat->values->data().get(), mat->row_offsets->data().get(),
1526                                mat->column_indices->data().get(), xarray, matstructT->beta_zero,
1527                                cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1528     } else {
1529 #if CUDA_VERSION>=4020
1530       cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
1531       if (cusparsestruct->workVector->size()) {
1532         stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1533             matstructT->alpha, matstructT->descr, hybMat,
1534             xarray, matstructT->beta_zero,
1535             cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
1536       }
1537 #endif
1538     }
1539 
1540     /* scatter the data from the temporary into the full vector with a += operation */
1541     thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
1542         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
1543         VecCUDAPlusEquals());
1544     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1545     ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
1546     ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);
1547 
1548   } catch(char *ex) {
1549     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1550   }
1551   ierr = WaitForGPU();CHKERRCUDA(ierr);
1552   ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
1553   PetscFunctionReturn(0);
1554 }
1555 
1556 static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
1557 {
1558   PetscErrorCode ierr;
1559 
1560   PetscFunctionBegin;
1561   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
1562   if (A->factortype==MAT_FACTOR_NONE) {
1563     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1564   }
1565   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1566   A->ops->mult             = MatMult_SeqAIJCUSPARSE;
1567   A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1568   A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1569   A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1570   PetscFunctionReturn(0);
1571 }
1572 
1573 /* --------------------------------------------------------------------------------*/
1574 /*@
1575    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1576    (the default parallel PETSc format). This matrix will ultimately pushed down
1577    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1578    assembly performance the user should preallocate the matrix storage by setting
1579    the parameter nz (or the array nnz).  By setting these parameters accurately,
1580    performance during matrix assembly can be increased by more than a factor of 50.
1581 
1582    Collective
1583 
1584    Input Parameters:
1585 +  comm - MPI communicator, set to PETSC_COMM_SELF
1586 .  m - number of rows
1587 .  n - number of columns
1588 .  nz - number of nonzeros per row (same for all rows)
1589 -  nnz - array containing the number of nonzeros in the various rows
1590          (possibly different for each row) or NULL
1591 
1592    Output Parameter:
1593 .  A - the matrix
1594 
1595    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1596    MatXXXXSetPreallocation() paradgm instead of this routine directly.
1597    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1598 
1599    Notes:
1600    If nnz is given then nz is ignored
1601 
1602    The AIJ format (also called the Yale sparse matrix format or
1603    compressed row storage), is fully compatible with standard Fortran 77
1604    storage.  That is, the stored row and column indices can begin at
1605    either one (as in Fortran) or zero.  See the users' manual for details.
1606 
1607    Specify the preallocated storage with either nz or nnz (not both).
1608    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1609    allocation.  For large problems you MUST preallocate memory or you
1610    will get TERRIBLE performance, see the users' manual chapter on matrices.
1611 
1612    By default, this format uses inodes (identical nodes) when possible, to
1613    improve numerical efficiency of matrix-vector products and solves. We
1614    search for consecutive rows with the same nonzero structure, thereby
1615    reusing matrix information to achieve increased efficiency.
1616 
1617    Level: intermediate
1618 
1619 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1620 @*/
1621 PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
1622 {
1623   PetscErrorCode ierr;
1624 
1625   PetscFunctionBegin;
1626   ierr = MatCreate(comm,A);CHKERRQ(ierr);
1627   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
1628   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1629   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
1630   PetscFunctionReturn(0);
1631 }
1632 
1633 static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
1634 {
1635   PetscErrorCode   ierr;
1636 
1637   PetscFunctionBegin;
1638   if (A->factortype==MAT_FACTOR_NONE) {
1639     if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
1640       ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
1641     }
1642   } else {
1643     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
1644   }
1645   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1646   PetscFunctionReturn(0);
1647 }
1648 
1649 static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
1650 {
1651   PetscErrorCode ierr;
1652   Mat C;
1653   cusparseStatus_t stat;
1654   cusparseHandle_t handle=0;
1655 
1656   PetscFunctionBegin;
1657   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
1658   C    = *B;
1659   ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
1660   ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);
1661 
1662   /* inject CUSPARSE-specific stuff */
1663   if (C->factortype==MAT_FACTOR_NONE) {
1664     /* you cannot check the inode.use flag here since the matrix was just created.
1665        now build a GPU matrix data structure */
1666     C->spptr = new Mat_SeqAIJCUSPARSE;
1667     ((Mat_SeqAIJCUSPARSE*)C->spptr)->mat          = 0;
1668     ((Mat_SeqAIJCUSPARSE*)C->spptr)->matTranspose = 0;
1669     ((Mat_SeqAIJCUSPARSE*)C->spptr)->workVector   = 0;
1670     ((Mat_SeqAIJCUSPARSE*)C->spptr)->format       = MAT_CUSPARSE_CSR;
1671     ((Mat_SeqAIJCUSPARSE*)C->spptr)->stream       = 0;
1672     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1673     ((Mat_SeqAIJCUSPARSE*)C->spptr)->handle       = handle;
1674     ((Mat_SeqAIJCUSPARSE*)C->spptr)->nonzerostate = 0;
1675   } else {
1676     /* NEXT, set the pointers to the triangular factors */
1677     C->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1678     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtr          = 0;
1679     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtr          = 0;
1680     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->loTriFactorPtrTranspose = 0;
1681     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->upTriFactorPtrTranspose = 0;
1682     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->rpermIndices            = 0;
1683     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->cpermIndices            = 0;
1684     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->workVector              = 0;
1685     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = 0;
1686     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1687     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->handle                  = handle;
1688     ((Mat_SeqAIJCUSPARSETriFactors*)C->spptr)->nnz                     = 0;
1689   }
1690 
1691   C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1692   C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1693   C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1694   C->ops->mult             = MatMult_SeqAIJCUSPARSE;
1695   C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1696   C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1697   C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1698   C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1699 
1700   ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1701 
1702   C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1703 
1704   ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1705   PetscFunctionReturn(0);
1706 }
1707 
1708 PETSC_EXTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat B)
1709 {
1710   PetscErrorCode ierr;
1711   cusparseStatus_t stat;
1712   cusparseHandle_t handle=0;
1713 
1714   PetscFunctionBegin;
1715   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
1716   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
1717 
1718   if (B->factortype==MAT_FACTOR_NONE) {
1719     /* you cannot check the inode.use flag here since the matrix was just created.
1720        now build a GPU matrix data structure */
1721     B->spptr = new Mat_SeqAIJCUSPARSE;
1722     ((Mat_SeqAIJCUSPARSE*)B->spptr)->mat          = 0;
1723     ((Mat_SeqAIJCUSPARSE*)B->spptr)->matTranspose = 0;
1724     ((Mat_SeqAIJCUSPARSE*)B->spptr)->workVector   = 0;
1725     ((Mat_SeqAIJCUSPARSE*)B->spptr)->format       = MAT_CUSPARSE_CSR;
1726     ((Mat_SeqAIJCUSPARSE*)B->spptr)->stream       = 0;
1727     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1728     ((Mat_SeqAIJCUSPARSE*)B->spptr)->handle       = handle;
1729     ((Mat_SeqAIJCUSPARSE*)B->spptr)->nonzerostate = 0;
1730   } else {
1731     /* NEXT, set the pointers to the triangular factors */
1732     B->spptr = new Mat_SeqAIJCUSPARSETriFactors;
1733     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtr          = 0;
1734     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtr          = 0;
1735     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->loTriFactorPtrTranspose = 0;
1736     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->upTriFactorPtrTranspose = 0;
1737     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->rpermIndices            = 0;
1738     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->cpermIndices            = 0;
1739     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->workVector              = 0;
1740     stat = cusparseCreate(&handle);CHKERRCUDA(stat);
1741     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->handle                  = handle;
1742     ((Mat_SeqAIJCUSPARSETriFactors*)B->spptr)->nnz                     = 0;
1743   }
1744 
1745   B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
1746   B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
1747   B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
1748   B->ops->mult             = MatMult_SeqAIJCUSPARSE;
1749   B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
1750   B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
1751   B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
1752   B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;
1753 
1754   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1755 
1756   B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;
1757 
1758   ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
1759   PetscFunctionReturn(0);
1760 }
1761 
1762 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
1763 {
1764   PetscErrorCode ierr;
1765 
1766   PetscFunctionBegin;
1767   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
1768   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B);
1769   PetscFunctionReturn(0);
1770 }
1771 
1772 /*MC
1773    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1774 
1775    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
1776    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1777    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1778 
1779    Options Database Keys:
1780 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1781 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1782 .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1783 
1784   Level: beginner
1785 
1786 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1787 M*/
1788 
1789 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1790 
1791 
1792 PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
1793 {
1794   PetscErrorCode ierr;
1795 
1796   PetscFunctionBegin;
1797   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1798   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1799   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1800   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
1801   PetscFunctionReturn(0);
1802 }
1803 
1804 
1805 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
1806 {
1807   cusparseStatus_t stat;
1808   cusparseHandle_t handle;
1809 
1810   PetscFunctionBegin;
1811   if (*cusparsestruct) {
1812     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);
1813     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);
1814     delete (*cusparsestruct)->workVector;
1815     if (handle = (*cusparsestruct)->handle) {
1816       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1817     }
1818     delete *cusparsestruct;
1819     *cusparsestruct = 0;
1820   }
1821   PetscFunctionReturn(0);
1822 }
1823 
1824 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
1825 {
1826   PetscFunctionBegin;
1827   if (*mat) {
1828     delete (*mat)->values;
1829     delete (*mat)->column_indices;
1830     delete (*mat)->row_offsets;
1831     delete *mat;
1832     *mat = 0;
1833   }
1834   PetscFunctionReturn(0);
1835 }
1836 
1837 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
1838 {
1839   cusparseStatus_t stat;
1840   PetscErrorCode   ierr;
1841 
1842   PetscFunctionBegin;
1843   if (*trifactor) {
1844     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUDA(stat); }
1845     if ((*trifactor)->solveInfo) { stat = cusparseDestroySolveAnalysisInfo((*trifactor)->solveInfo);CHKERRCUDA(stat); }
1846     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
1847     delete *trifactor;
1848     *trifactor = 0;
1849   }
1850   PetscFunctionReturn(0);
1851 }
1852 
1853 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
1854 {
1855   CsrMatrix        *mat;
1856   cusparseStatus_t stat;
1857   cudaError_t      err;
1858 
1859   PetscFunctionBegin;
1860   if (*matstruct) {
1861     if ((*matstruct)->mat) {
1862       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
1863         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
1864         stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
1865       } else {
1866         mat = (CsrMatrix*)(*matstruct)->mat;
1867         CsrMatrix_Destroy(&mat);
1868       }
1869     }
1870     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
1871     delete (*matstruct)->cprowIndices;
1872     if ((*matstruct)->alpha)     { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
1873     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
1874     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
1875     delete *matstruct;
1876     *matstruct = 0;
1877   }
1878   PetscFunctionReturn(0);
1879 }
1880 
1881 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
1882 {
1883   cusparseHandle_t handle;
1884   cusparseStatus_t stat;
1885 
1886   PetscFunctionBegin;
1887   if (*trifactors) {
1888     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);
1889     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);
1890     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);
1891     MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);
1892     delete (*trifactors)->rpermIndices;
1893     delete (*trifactors)->cpermIndices;
1894     delete (*trifactors)->workVector;
1895     if (handle = (*trifactors)->handle) {
1896       stat = cusparseDestroy(handle);CHKERRCUDA(stat);
1897     }
1898     delete *trifactors;
1899     *trifactors = 0;
1900   }
1901   PetscFunctionReturn(0);
1902 }
1903 
1904