xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 487a658c8b32ba712a1dc8280daad2fd70c1dcd9)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 
7 #include <petscconf.h>
8 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
9 #include <../src/mat/impls/sbaij/seq/sbaij.h>
10 #include <../src/vec/vec/impls/dvecimpl.h>
11 #include <petsc/private/vecimpl.h>
12 #undef VecType
13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14 
/* String names for the MatCUSPARSEStorageFormat enum, consumed by PetscOptionsEnum();
   the trailing "MatCUSPARSEStorageFormat" / "MAT_CUSPARSE_" entries are the enum type
   name and option prefix required by the PETSc enum-option machinery. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};

/* Forward declarations: symbolic/numeric factorization routines installed on factor matrices */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves and matrix-vector products; the NaturalOrdering variants are
   installed when row/column permutations are the identity */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);

/* Destructors for GPU-side data.  Note MatSeqAIJCUSPARSEMultStruct_Destroy is
   deliberately overloaded (this is a C++ translation unit): one overload destroys a
   triangular-factor struct, the other a mult struct for a given storage format. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
40 
/*
   MatCUSPARSESetStream - Records a CUDA stream in the matrix's GPU data and
   pushes it into the attached cuSPARSE handle so subsequent cuSPARSE calls
   run on that stream.

   Input Parameters:
+  A      - the matrix; must carry a Mat_SeqAIJCUSPARSE in A->spptr
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* guard against calling this on a matrix that is not (yet) a CUSPARSE matrix */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing spptr: matrix must be of type SEQAIJCUSPARSE");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
51 
/*
   MatCUSPARSESetHandle - Attaches a caller-supplied cuSPARSE handle to the matrix.
   If a different handle is already attached it is destroyed first, and the
   (possibly new) handle is switched to device pointer mode.

   NOTE(review): the previously attached handle is destroyed here, so the struct
   is assumed to own whatever handle it currently holds — confirm callers expect
   the old handle to be invalidated.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the old handle before taking ownership of the new one */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUDA(stat);
    }
    cusparsestruct->handle = handle;
  }
  /* device pointer mode: scalar arguments to cuSPARSE calls are device pointers */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
  PetscFunctionReturn(0);
}
67 
/*
   MatCUSPARSEClearHandle - Forgets (without destroying) the cuSPARSE handle
   attached to the matrix.  Presumably used when the handle is owned elsewhere
   and must not be destroyed by this matrix's teardown — confirm with callers.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
76 
/* Query callback composed on factor matrices under "MatFactorGetSolverType_C":
   reports MATSOLVERCUSPARSE as the factorization package.  A is unused but
   required by the callback signature. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
83 
84 /*MC
85   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
89   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
90   algorithms are not recommended. This class does NOT support direct solver operations.
91 
92   Level: beginner
93 
94 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
95 M*/
96 
/* MatGetFactor implementation registered for (MATSEQAIJCUSPARSE, MATSOLVERCUSPARSE):
   creates an empty square factor matrix B and installs the symbolic factorization
   routines appropriate to the requested factor type.  The numeric routines are
   installed later, by the symbolic routines themselves. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  /* factortype is set before MatSetType(): keep this ordering */
  (*B)->factortype = ftype;
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* NOTE(review): block sizes are propagated only for the LU family, not for
       Cholesky/ICC below — confirm this asymmetry is intentional */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* storage is allocated during the symbolic phase, so skip preallocation here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121 
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format in the matrix's Mat_SeqAIJCUSPARSE struct.  For sequential
   matrices MULT and ALL both map onto the single format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
#if CUDA_VERSION>=4020
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both operations share one format field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
#else
  /* before CUDA 4.2 only CSR is available; reject the other formats */
  if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format require CUDA 4.2 or later.");
#endif
  PetscFunctionReturn(0);
}
143 
144 /*@
145    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
146    operation. Only the MatMult operation can use different GPU storage formats
147    for MPIAIJCUSPARSE matrices.
148    Not Collective
149 
150    Input Parameters:
151 +  A - Matrix of type SEQAIJCUSPARSE
152 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
153 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
154 
155    Output Parameter:
156 
157    Level: intermediate
158 
159 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
160 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: a no-op for matrix types that do not compose
     "MatCUSPARSESetFormat_C", so this is safe to call on any Mat */
  ierr = PetscTryMethod(A, "MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
170 
/* Processes runtime options for SEQAIJCUSPARSE matrices:
     -mat_cusparse_mult_storage_format : format for SpMV only (non-factored matrices)
     -mat_cusparse_storage_format      : format for SpMV and TriSolve */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  /* NOTE(review): this opens an object option scope on top of the PetscOptionsHead
     above and closes it with PetscOptionsEnd() below, leaving the Head without a
     matching PetscOptionsTail() — confirm this pairing is intended */
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);  /* was missing CHKERRQ */
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the SpMV-only format option is offered only for non-factored matrices */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                          "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
  if (flg) {
    ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
197 
/* ILU(k) symbolic factorization: delegate to the host SeqAIJ implementation,
   then install the CUSPARSE numeric-factorization routine on the factor matrix. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
207 
/* Full LU symbolic factorization: delegate to the host SeqAIJ implementation,
   then install the CUSPARSE numeric-factorization routine on the factor matrix. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
217 
/* ICC(k) symbolic factorization: delegate to the host SeqAIJ implementation,
   then install the CUSPARSE numeric Cholesky routine on the factor matrix. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
227 
/* Full Cholesky symbolic factorization: delegate to the host SeqAIJ implementation,
   then install the CUSPARSE numeric Cholesky routine on the factor matrix. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
237 
/* Builds the unit-diagonal lower-triangular factor of the host ILU/LU factorization
   as a stand-alone CSR matrix, uploads it to the GPU (32-bit indices), and runs the
   cuSPARSE triangular-solve analysis on it.  Does nothing when the GPU copy is
   already current.  The strictly-lower entries come straight from the aij arrays
   (a->i, a->j, a->a); the unit diagonal is appended explicitly to each row. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscScalar                       *AALo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): reads ai[1], i.e. assumes n >= 1 — confirm n == 0 cannot reach here */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host buffers; freed with cudaFreeHost below */
      ierr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the lower triangular matrix; row 0 holds only the unit diagonal entry */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i, then append the unit diagonal */
        ierr = PetscMemcpy(&(AjLo[offset]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AALo[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copy with 32-bit row/column indices */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      ierr = cudaFreeHost(AiLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjLo);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
339 
/* Builds the non-unit-diagonal upper-triangular factor of the host ILU/LU
   factorization, uploads it to the GPU as a CSR matrix, and runs the cuSPARSE
   triangular-solve analysis on it.  Rows are assembled back-to-front using the
   a->diag offsets; each row stores the diagonal entry first, as its reciprocal
   (1./v[nz]).  Does nothing when the GPU copy is already current. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix */
      /* pinned (page-locked) host buffers; freed with cudaFreeHost below */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix, walking rows from last to first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements (stored as the reciprocal 1./v[nz]) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then copy the strictly-upper entries of row i */
        ierr = PetscMemcpy(&(AjUp[offset+1]), vi, nz*sizeof(PetscInt));CHKERRQ(ierr);
        ierr = PetscMemcpy(&(AAUp[offset+1]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copy with 32-bit row/column indices */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
437 
/* Uploads both ILU triangular factors to the GPU, runs the cuSPARSE solve
   analysis on each, allocates the scratch vector used between the two
   triangular solves, and uploads the row/column permutations when they are
   not the identity. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  const PetscInt               *rowIndices,*colIndices;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* build, upload and analyze L and U */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = a->nz;

  A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;

  /* row permutation (lower-triangular side): uploaded only when non-identity */
  ierr = ISGetIndices(a->row,&rowIndices);CHKERRQ(ierr);
  ierr = ISIdentity(a->row,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rowIndices, rowIndices+n);
  }
  ierr = ISRestoreIndices(a->row,&rowIndices);CHKERRQ(ierr);

  /* column permutation (upper-triangular side): uploaded only when non-identity */
  ierr = ISGetIndices(a->icol,&colIndices);CHKERRQ(ierr);
  ierr = ISIdentity(a->icol,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(colIndices, colIndices+n);
  }
  ierr = ISRestoreIndices(a->icol,&colIndices);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
475 
/* Builds both triangular factors of an ICC/Cholesky factorization on the host and
   uploads them to the GPU.  The upper factor stores 1/d_i in the diagonal slot and
   negated off-diagonal entries; the "lower" factor shares the same upper-triangular
   CSR structure with additionally 1/d_i-scaled values and is solved with a
   TRANSPOSE operation.  NOTE(review): A->data is viewed both as Mat_SeqAIJ (for
   a->nz) and as Mat_SeqSBAIJ (for the i/j/a arrays) — presumably the SeqAIJ ICC
   factor is laid out sbaij-style; confirm. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      /* pinned host buffers; AAUp/AALo hold the values of the two factors, which share AiUp/AjUp */
      ierr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);
      ierr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(ierr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements (both factors store 1/d_i there) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonal entries: negated for the upper factor, additionally scaled by 1/d_i for the lower one */
          ierr = PetscMemcpy(&(AjUp[offset]), vj, nz*sizeof(PetscInt));CHKERRQ(ierr);
          ierr = PetscMemcpy(&(AAUp[offset]), v, nz*sizeof(PetscScalar));CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: zero-based, triangular, upper fill, unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description: the lower solve reuses the upper-triangular
         structure (fill mode UPPER) and is performed as a TRANSPOSE solve below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUDA(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUDA(stat);
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUDA(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUDA(stat);

      /* Create the solve analysis information */
      stat = cusparseCreateSolveAnalysisInfo(&loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo);CHKERRCUDA(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* mark the GPU copy as current */
      A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
      ierr = cudaFreeHost(AiUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AjUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AAUp);CHKERRCUDA(ierr);
      ierr = cudaFreeHost(AALo);CHKERRCUDA(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
620 
/* Uploads the ICC/Cholesky triangular factors to the GPU (including the solve
   analysis), allocates the scratch vector used between the two triangular
   solves, and uploads the permutation when it is not the identity.  Cholesky
   uses the same permutation for rows and columns. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm = a->row;
  const PetscInt               *permIndices;
  PetscBool                    permIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* off-diagonal entries are stored once per factor (twice total), diagonal entries once */
  cusparseTriFactors->nnz = (a->nz-n)*2 + n;

  ierr = ISGetIndices(perm,&permIndices);CHKERRQ(ierr);
  ierr = ISIdentity(perm,&permIdentity);CHKERRQ(ierr);
  if (!permIdentity) {
    /* same index set on both sides of the symmetric factorization */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(permIndices, permIndices+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(permIndices, permIndices+n);
  }
  ierr = ISRestoreIndices(perm,&permIndices);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
648 
/* Numeric LU/ILU factorization: runs the host SeqAIJ factorization, chooses the
   solve kernels (the NaturalOrdering variants when both permutations are the
   identity), then uploads and analyzes the triangular factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *factors = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side numeric factorization first */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering on both sides lets MatSolve skip the permutation step */
  ierr = ISIdentity(factors->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(factors->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* upload the triangular factors and run the cuSPARSE solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
673 
/*
   Numeric Cholesky factorization: the factorization itself runs on the CPU;
   this wrapper then installs the appropriate GPU solve kernels and copies the
   triangular factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* perform the numeric factorization on the CPU */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);

  /* natural ordering (identity permutation) admits the permutation-free
     solve path */
  ierr = ISIdentity(fact->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* copy the triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
698 
/*
   Builds, from one triangular factor stored in CSR format, its explicit
   transpose (i.e. the CSC of the factor, reinterpreted as CSR) together with
   the cuSPARSE solve-analysis information needed to apply it, and returns the
   new structure in *triFactorT.  The descriptor of the transpose keeps the
   type, index base and diagonal type of the input factor, but the fill mode
   is flipped (the transpose of a lower factor is upper, and vice versa).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTriFactorTranspose(Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorT)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *T;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  T = new Mat_SeqAIJCUSPARSETriFactorStruct;

  /* copy the descriptor settings of the input factor, flipping the fill mode */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&T->descr);CHKERRCUDA(stat);
  stat = cusparseSetMatIndexBase(T->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(T->descr, matrixType);CHKERRCUDA(stat);
  stat = cusparseSetMatFillMode(T->descr, fillMode);CHKERRCUDA(stat);
  stat = cusparseSetMatDiagType(T->descr, diagType);CHKERRCUDA(stat);

  /* Create the solve analysis information */
  stat = cusparseCreateSolveAnalysisInfo(&T->solveInfo);CHKERRCUDA(stat);

  /* the transpose is stored explicitly, so its solve uses the non-transpose op */
  T->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  T->csrMat = new CsrMatrix;
  T->csrMat->num_rows       = triFactor->csrMat->num_rows;
  T->csrMat->num_cols       = triFactor->csrMat->num_cols;
  T->csrMat->num_entries    = triFactor->csrMat->num_entries;
  T->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactor->csrMat->num_rows+1);
  T->csrMat->column_indices = new THRUSTINTARRAY32(triFactor->csrMat->num_entries);
  T->csrMat->values         = new THRUSTARRAY(triFactor->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          T->csrMat->values->data().get(),
                          T->csrMat->column_indices->data().get(),
                          T->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

  /* perform the solve analysis on the transposed matrix */
  stat = cusparse_analysis(cusparseTriFactors->handle, T->solveOp,
                           T->csrMat->num_rows, T->csrMat->num_entries,
                           T->descr, T->csrMat->values->data().get(),
                           T->csrMat->row_offsets->data().get(), T->csrMat->column_indices->data().get(),
                           T->solveInfo);CHKERRCUDA(stat);

  *triFactorT = T;
  PetscFunctionReturn(0);
}

/*
   Generates the explicit transposes of both triangular factors (lower and
   upper) and stores them in the tri-factors structure; they are used by the
   MatSolveTranspose kernels.  The lower/upper halves of the original body
   were identical except for the factor operated on, so the work is factored
   into MatSeqAIJCUSPARSEBuildTriFactorTranspose() above.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT,*upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* the transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose(cusparseTriFactors,(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* the transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose(cusparseTriFactors,(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
829 
/*
   Builds a device copy of the transpose of A, in the storage format selected
   for A (CSR, ELL or HYB), and stores it in cusparsestruct->matTranspose so
   that MatMultTranspose can run as a plain mat-vec with the non-transpose
   cuSPARSE kernels.

   Fix: in the CSR branch the transpose's row_offsets array is now sized by
   the transpose's own row count (A->cmap->n + 1).  cusparse_csr2csc writes
   num_cols(input)+1 = A->cmap->n+1 offsets into that array, so the previous
   size of A->rmap->n+1 under-allocated for rectangular matrices with more
   columns than rows.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;

  PetscFunctionBegin;

  /* allocate space for the matrix-transpose information */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUDA(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUDA(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);

  /* set alpha and beta on the device (device pointer mode is selected below) */
  err = cudaMalloc((void **)&(matstructT->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose has the dimensions of A swapped */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    /* sized by the transpose's own row count: csr2csc writes num_rows+1 offsets */
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* compute the transpose of A, i.e. the CSC of A reinterpreted as CSR */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparse_csr2csc(cusparsestruct->handle, matrix->num_rows,
                            matrix->num_cols, matrix->num_entries,
                            matrix->values->data().get(),
                            matrix->row_offsets->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                            matrixT->column_indices->data().get(),
                            matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = matrixT;

  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if CUDA_VERSION>=5000
    /* First convert HYB to CSR */
    CsrMatrix *temp= new CsrMatrix;
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);


    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUDA(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT keeps the dimensions of A rather than of A^T, and
       csr2hyb below is called with (A->rmap->n, A->cmap->n); this looks
       correct only for square matrices — confirm before using HYB/ELL with
       rectangular A */
    CsrMatrix *tempT= new CsrMatrix;
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUDA(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUDA(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;

    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
#else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"ELL (Ellpack) and HYB (Hybrid) storage format for the Matrix Transpose (in MatMultTranspose) require CUDA 5.0 or later.");
#endif
  }
  /* assign the compressed row indices: the transpose is stored uncompressed,
     so this is simply 0,1,...,A->cmap->n-1 */
  matstructT->cprowIndices = new THRUSTINTARRAY;
  matstructT->cprowIndices->resize(A->cmap->n);
  thrust::sequence(matstructT->cprowIndices->begin(), matstructT->cprowIndices->end());

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
958 
/*
   Solves A^T x = b for an LU/ILU-factored matrix whose factorization
   ordering is NOT the identity: b is gathered through the row permutation,
   the two (explicitly transposed) triangular solves are applied, and the
   result is gathered through the column permutation.  The transposed factors
   are built on first use.  rpermIndices/cpermIndices exist because the
   numeric-factorization routines install this function only for non-identity
   orderings.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (only the
     first time this routine is called; subsequent calls reuse the result) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: gather, x[i] = b[rperm[i]] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U (for the transposed system the factors apply in the
     reverse of the order used by MatSolve): x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows, &ALPHA, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows, &ALPHA, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);

  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1026 
/*
   Solves A^T x = b for a factored matrix with natural ordering: no
   permutations are applied; the two explicitly-transposed triangular solves
   run back to back through a single device work vector.  The transposed
   factors are generated on the fly the first time they are needed.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  status;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* build the transposed factors lazily, on the first call */
  if (!loFactT && !upFactT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upFactT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  /* for the transposed system the upper factor is applied first: b -> work */
  status = cusparse_solve(factors->handle, upFactT->solveOp,
                          upFactT->csrMat->num_rows, &ALPHA, upFactT->descr,
                          upFactT->csrMat->values->data().get(),
                          upFactT->csrMat->row_offsets->data().get(),
                          upFactT->csrMat->column_indices->data().get(),
                          upFactT->solveInfo,
                          b, work->data().get());CHKERRCUDA(status);

  /* ... followed by the lower factor: work -> x */
  status = cusparse_solve(factors->handle, loFactT->solveOp,
                          loFactT->csrMat->num_rows, &ALPHA, loFactT->descr,
                          loFactT->csrMat->values->data().get(),
                          loFactT->csrMat->row_offsets->data().get(),
                          loFactT->csrMat->column_indices->data().get(),
                          loFactT->solveInfo,
                          work->data().get(), x);CHKERRCUDA(status);

  /* restore the vector arrays and synchronize before logging */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1075 
/*
   Solves A x = b for an LU/ILU-factored matrix whose factorization ordering
   is NOT the identity: b is gathered through the row permutation, the
   forward (L) and backward (U) triangular solves are applied, and the result
   is gathered through the column permutation.  rpermIndices/cpermIndices
   exist because the numeric-factorization routines install this function
   only for non-identity orderings.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* First, reorder with the row permutation: gather, x[i] = b[rperm[i]].
     NOTE(review): the end iterator here uses base bGPU while the transpose
     variant uses bGPU+n; the base of the end iterator does not affect
     iteration (only the index iterator does), so the two forms appear
     equivalent — confirm against thrust::permutation_iterator semantics */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve L: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows, &ALPHA, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        xarray, tempGPU->data().get());CHKERRCUDA(stat);

  /* Then, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows, &ALPHA, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray);CHKERRCUDA(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1134 
/*
   Solves A x = b when the factorization used the natural ordering: no
   permutations are needed, so the two triangular solves read b and write x
   directly, through a single device work vector.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  status;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loFact  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upFact  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  /* solve with the lower factor first: b -> work */
  status = cusparse_solve(factors->handle, loFact->solveOp,
                          loFact->csrMat->num_rows, &ALPHA, loFact->descr,
                          loFact->csrMat->values->data().get(),
                          loFact->csrMat->row_offsets->data().get(),
                          loFact->csrMat->column_indices->data().get(),
                          loFact->solveInfo,
                          b, work->data().get());CHKERRCUDA(status);

  /* then with the upper factor: work -> x */
  status = cusparse_solve(factors->handle, upFact->solveOp,
                          upFact->csrMat->num_rows, &ALPHA, upFact->descr,
                          upFact->csrMat->values->data().get(),
                          upFact->csrMat->row_offsets->data().get(),
                          upFact->csrMat->column_indices->data().get(),
                          upFact->solveInfo,
                          work->data().get(), x);CHKERRCUDA(status);

  /* restore the vector arrays and synchronize before logging */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1175 
1176 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1177 {
1178 
1179   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1180   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1181   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1182   PetscInt                     m = A->rmap->n,*ii,*ridx;
1183   PetscErrorCode               ierr;
1184   cusparseStatus_t             stat;
1185   cudaError_t                  err;
1186 
1187   PetscFunctionBegin;
1188   if (A->valid_GPU_matrix == PETSC_OFFLOAD_UNALLOCATED || A->valid_GPU_matrix == PETSC_OFFLOAD_CPU) {
1189     ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1190     if (A->assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1191       CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1192       /* copy values only */
1193       matrix->values->assign(a->a, a->a+a->nz);
1194     } else {
1195       MatSeqAIJCUSPARSEMultStruct_Destroy(&matstruct,cusparsestruct->format);
1196       try {
1197         cusparsestruct->nonzerorow=0;
1198         for (int j = 0; j<m; j++) cusparsestruct->nonzerorow += ((a->i[j+1]-a->i[j])>0);
1199 
1200         if (a->compressedrow.use) {
1201           m    = a->compressedrow.nrows;
1202           ii   = a->compressedrow.i;
1203           ridx = a->compressedrow.rindex;
1204         } else {
1205           /* Forcing compressed row on the GPU */
1206           int k=0;
1207           ierr = PetscMalloc1(cusparsestruct->nonzerorow+1, &ii);CHKERRQ(ierr);
1208           ierr = PetscMalloc1(cusparsestruct->nonzerorow, &ridx);CHKERRQ(ierr);
1209           ii[0]=0;
1210           for (int j = 0; j<m; j++) {
1211             if ((a->i[j+1]-a->i[j])>0) {
1212               ii[k]  = a->i[j];
1213               ridx[k]= j;
1214               k++;
1215             }
1216           }
1217           ii[cusparsestruct->nonzerorow] = a->nz;
1218           m = cusparsestruct->nonzerorow;
1219         }
1220 
1221         /* allocate space for the triangular factor information */
1222         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1223         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUDA(stat);
1224         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUDA(stat);
1225         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUDA(stat);
1226 
1227         err = cudaMalloc((void **)&(matstruct->alpha),sizeof(PetscScalar));CHKERRCUDA(err);
1228         err = cudaMemcpy(matstruct->alpha,&ALPHA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1229         err = cudaMalloc((void **)&(matstruct->beta),sizeof(PetscScalar));CHKERRCUDA(err);
1230         err = cudaMemcpy(matstruct->beta,&BETA,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1231         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUDA(stat);
1232 
1233         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1234         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1235           /* set the matrix */
1236           CsrMatrix *matrix= new CsrMatrix;
1237           matrix->num_rows = m;
1238           matrix->num_cols = A->cmap->n;
1239           matrix->num_entries = a->nz;
1240           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1241           matrix->row_offsets->assign(ii, ii + m+1);
1242 
1243           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1244           matrix->column_indices->assign(a->j, a->j+a->nz);
1245 
1246           matrix->values = new THRUSTARRAY(a->nz);
1247           matrix->values->assign(a->a, a->a+a->nz);
1248 
1249           /* assign the pointer */
1250           matstruct->mat = matrix;
1251 
1252         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1253 #if CUDA_VERSION>=4020
1254           CsrMatrix *matrix= new CsrMatrix;
1255           matrix->num_rows = m;
1256           matrix->num_cols = A->cmap->n;
1257           matrix->num_entries = a->nz;
1258           matrix->row_offsets = new THRUSTINTARRAY32(m+1);
1259           matrix->row_offsets->assign(ii, ii + m+1);
1260 
1261           matrix->column_indices = new THRUSTINTARRAY32(a->nz);
1262           matrix->column_indices->assign(a->j, a->j+a->nz);
1263 
1264           matrix->values = new THRUSTARRAY(a->nz);
1265           matrix->values->assign(a->a, a->a+a->nz);
1266 
1267           cusparseHybMat_t hybMat;
1268           stat = cusparseCreateHybMat(&hybMat);CHKERRCUDA(stat);
1269           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1270             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1271           stat = cusparse_csr2hyb(cusparsestruct->handle, matrix->num_rows, matrix->num_cols,
1272               matstruct->descr, matrix->values->data().get(),
1273               matrix->row_offsets->data().get(),
1274               matrix->column_indices->data().get(),
1275               hybMat, 0, partition);CHKERRCUDA(stat);
1276           /* assign the pointer */
1277           matstruct->mat = hybMat;
1278 
1279           if (matrix) {
1280             if (matrix->values) delete (THRUSTARRAY*)matrix->values;
1281             if (matrix->column_indices) delete (THRUSTINTARRAY32*)matrix->column_indices;
1282             if (matrix->row_offsets) delete (THRUSTINTARRAY32*)matrix->row_offsets;
1283             delete (CsrMatrix*)matrix;
1284           }
1285 #endif
1286         }
1287 
1288         /* assign the compressed row indices */
1289         matstruct->cprowIndices = new THRUSTINTARRAY(m);
1290         matstruct->cprowIndices->assign(ridx,ridx+m);
1291 
1292         /* assign the pointer */
1293         cusparsestruct->mat = matstruct;
1294 
1295         if (!a->compressedrow.use) {
1296           ierr = PetscFree(ii);CHKERRQ(ierr);
1297           ierr = PetscFree(ridx);CHKERRQ(ierr);
1298         }
1299         cusparsestruct->workVector = new THRUSTARRAY(m);
1300       } catch(char *ex) {
1301         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1302       }
1303       cusparsestruct->nonzerostate = A->nonzerostate;
1304     }
1305     ierr = WaitForGPU();CHKERRCUDA(ierr);
1306     A->valid_GPU_matrix = PETSC_OFFLOAD_BOTH;
1307     ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1308   }
1309   PetscFunctionReturn(0);
1310 }
1311 
/* Create CUDA vectors compatible with this matrix: the right vector matches the
   column layout (input of MatMult), the left vector matches the row layout (output). */
static PetscErrorCode MatCreateVecs_SeqAIJCUSPARSE(Mat mat, Vec *right, Vec *left)
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       rbs,cbs;

  PetscFunctionBegin;
  comm = PetscObjectComm((PetscObject)mat);
  ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
  if (right) {
    /* right vector lives in the column space: length = number of columns, column block size */
    ierr = VecCreate(comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix column layout rather than building a new one */
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    /* left vector lives in the row space: length = number of rows, row block size */
    ierr = VecCreate(comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECSEQCUDA);CHKERRQ(ierr);
    /* share the matrix row layout rather than building a new one */
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1335 
/* Thrust functor applied to a zip of (source, destination) element pairs:
   accumulates the source into the destination (dst += src). Used below to
   scatter-add the compressed-row work vector into the full result vector. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    /* tuple element 0 is the addend, element 1 the accumulation target */
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
1345 
/* Computes yy = A*xx on the GPU with cuSPARSE, in either CSR or ELL/HYB storage. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct; /* Do not initialize yet, because GPU data may not be available yet */
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* zero yy first: the spmv may write fewer than the full number of rows when the
     GPU matrix stores only the compressed (nonzero) rows */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSE*)A->spptr ? (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat : 0;
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
    /* alpha/beta are device-side scalars; pointer mode DEVICE was set when the matrix was copied to the GPU */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols, mat->num_entries,
                             matstruct->alpha, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB are both represented by a cusparseHybMat_t */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstruct->alpha, matstruct->descr, hybMat,
                             xarray, matstruct->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    /* no user-supplied stream: block until the multiply has completed */
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* one multiply + one add per nonzero, minus one add per nonzero row */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1387 
/* Computes yy = A^T*xx on the GPU by multiplying (non-transposed) with an explicitly
   stored transpose of A, which is generated lazily on first use. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT;
  const PetscScalar            *xarray;
  PetscScalar                  *yarray;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build and cache the transposed GPU matrix */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }
  ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
  /* zero yy first; the spmv may write fewer than the full number of rows of A^T */
  ierr = VecSet(yy,0);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(yy,&yarray);CHKERRQ(ierr);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
    /* NON_TRANSPOSE is correct here: the stored matrix is already the transpose */
    stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             mat->num_rows, mat->num_cols,
                             mat->num_entries, matstructT->alpha, matstructT->descr,
                             mat->values->data().get(), mat->row_offsets->data().get(),
                             mat->column_indices->data().get(), xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
  } else {
#if CUDA_VERSION>=4020
    /* ELL and HYB are both represented by a cusparseHybMat_t */
    cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
    stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             matstructT->alpha, matstructT->descr, hybMat,
                             xarray, matstructT->beta,
                             yarray);CHKERRCUDA(stat);
#endif
  }
  ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(yy,&yarray);CHKERRQ(ierr);
  if (!cusparsestruct->stream) {
    /* no user-supplied stream: block until the multiply has completed */
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  }
  /* flop count is identical to the forward multiply */
  ierr = PetscLogFlops(2.0*a->nz - cusparsestruct->nonzerorow);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1435 
1436 
/* Computes zz = yy + A*xx on the GPU: the product lands in a compressed work
   vector (one slot per nonzero row) and is then scatter-added into zz using the
   compressed-row index map. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstruct;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  try {
    /* start from zz = yy, then accumulate the product into zz */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstruct->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
      /* skip the spmv entirely when the matrix has no nonzero rows */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstruct->alpha, matstruct->descr, hybMat,
            xarray, matstruct->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation:
       workVector[k] is added to zz[cprowIndices[k]] via a permutation iterator */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + cusparsestruct->workVector->size(),
        VecCUDAPlusEquals());
    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1496 
/* Computes zz = yy + A^T*xx on the GPU using the cached explicit transpose of A:
   the product lands in the compressed work vector and is scatter-added into zz. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_SeqAIJ                      *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE              *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct    *matstructT;
  thrust::device_ptr<PetscScalar> zptr;
  const PetscScalar               *xarray;
  PetscScalar                     *zarray;
  PetscErrorCode                  ierr;
  cusparseStatus_t                stat;

  PetscFunctionBegin;
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (!matstructT) {
    /* first transpose multiply: build and cache the transposed GPU matrix */
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  }

  try {
    /* start from zz = yy, then accumulate the product into zz */
    ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayReadWrite(zz,&zarray);CHKERRQ(ierr);
    zptr = thrust::device_pointer_cast(zarray);

    /* multiply add with matrix transpose */
    if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
      CsrMatrix *mat = (CsrMatrix*)matstructT->mat;
      /* here we need to be careful to set the number of rows in the multiply to the
         number of compressed rows in the matrix ... which is equivalent to the
         size of the workVector */
      stat = cusparse_csr_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstructT->alpha, matstructT->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xarray, matstructT->beta,
                               cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
    } else {
#if CUDA_VERSION>=4020
      cusparseHybMat_t hybMat = (cusparseHybMat_t)matstructT->mat;
      /* skip the spmv entirely when there is no work to do */
      if (cusparsestruct->workVector->size()) {
        stat = cusparse_hyb_spmv(cusparsestruct->handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
            matstructT->alpha, matstructT->descr, hybMat,
            xarray, matstructT->beta,
            cusparsestruct->workVector->data().get());CHKERRCUDA(stat);
      }
#endif
    }

    /* scatter the data from the temporary into the full vector with a += operation */
    /* NOTE(review): the iteration count below is A->cmap->n, whereas the analogous
       scatter in MatMultAdd_SeqAIJCUSPARSE uses cusparsestruct->workVector->size();
       if the work vector is shorter than A->cmap->n this walks past its end —
       confirm the intended bound against how cprowIndices/workVector are sized for
       the transpose */
    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))),
        thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstructT->cprowIndices->begin()))) + A->cmap->n,
        VecCUDAPlusEquals());

    ierr = VecCUDARestoreArrayRead(xx,&xarray);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(zz,&zarray);CHKERRQ(ierr);

  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1562 
/* Finish CPU-side assembly, mirror the data to the GPU, and on a final assembly
   install the CUSPARSE implementations of the multiply operations. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* factored matrices keep their data in the triangular-factor structures,
     so only unfactored matrices are pushed to the GPU here */
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  if (mode != MAT_FLUSH_ASSEMBLY) {
    A->ops->mult             = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1579 
1580 /* --------------------------------------------------------------------------------*/
1581 /*@
1582    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
1584    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1585    assembly performance the user should preallocate the matrix storage by setting
1586    the parameter nz (or the array nnz).  By setting these parameters accurately,
1587    performance during matrix assembly can be increased by more than a factor of 50.
1588 
1589    Collective on MPI_Comm
1590 
1591    Input Parameters:
1592 +  comm - MPI communicator, set to PETSC_COMM_SELF
1593 .  m - number of rows
1594 .  n - number of columns
1595 .  nz - number of nonzeros per row (same for all rows)
1596 -  nnz - array containing the number of nonzeros in the various rows
1597          (possibly different for each row) or NULL
1598 
1599    Output Parameter:
1600 .  A - the matrix
1601 
1602    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
1604    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1605 
1606    Notes:
1607    If nnz is given then nz is ignored
1608 
1609    The AIJ format (also called the Yale sparse matrix format or
1610    compressed row storage), is fully compatible with standard Fortran 77
1611    storage.  That is, the stored row and column indices can begin at
1612    either one (as in Fortran) or zero.  See the users' manual for details.
1613 
1614    Specify the preallocated storage with either nz or nnz (not both).
1615    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1616    allocation.  For large problems you MUST preallocate memory or you
1617    will get TERRIBLE performance, see the users' manual chapter on matrices.
1618 
1619    By default, this format uses inodes (identical nodes) when possible, to
1620    improve numerical efficiency of matrix-vector products and solves. We
1621    search for consecutive rows with the same nonzero structure, thereby
1622    reusing matrix information to achieve increased efficiency.
1623 
1624    Level: intermediate
1625 
1626 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
1627 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create an empty matrix, size it, give it the CUSPARSE type, then preallocate */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation directly; the cast drops const as required by its signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1639 
/* Destroy the GPU-side data structures (mult structures or triangular factors)
   and then dispatch to the base SeqAIJ destroy. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (A->factortype==MAT_FACTOR_NONE) {
    /* the GPU mirror exists only once data has actually been offloaded */
    if (A->valid_GPU_matrix != PETSC_OFFLOAD_UNALLOCATED) {
      ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
    }
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* clear the composed method installed at creation time, per PETSc convention
     of removing composed functions in the type's destroy routine */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1655 
/* Duplicate a SeqAIJCUSPARSE matrix: duplicate the CPU data, then attach a fresh
   (empty) GPU structure; the GPU copy is rebuilt lazily on the next use.
   Fix: the duplicate now inherits A's chosen storage format instead of always
   being reset to CSR; redundant double-initializations of handle/stream removed. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode   ierr;
  Mat              C;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  C    = *B;
  /* inject CUSPARSE-specific stuff */
  if (C->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    C->spptr           = cusp;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    /* preserve the storage format selected on A rather than resetting the duplicate to CSR */
    cusp->format       = A->spptr ? ((Mat_SeqAIJCUSPARSE*)A->spptr)->format : MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    C->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  C->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  C->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  C->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->mult             = MatMult_SeqAIJCUSPARSE;
  C->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  C->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  C->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  C->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet; that happens on first use */
  C->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1714 
/* Type constructor for MATSEQAIJCUSPARSE: build the base SeqAIJ matrix, attach an
   empty GPU data structure (mult structures or triangular factors, depending on
   whether this will be a factor matrix), and install the CUSPARSE operations. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle=0;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  if (B->factortype==MAT_FACTOR_NONE) {
    /* you cannot check the inode.use flag here since the matrix was just created.
       now build a GPU matrix data structure */
    Mat_SeqAIJCUSPARSE *cusp = new Mat_SeqAIJCUSPARSE;
    B->spptr           = cusp;
    cusp->mat          = 0;
    cusp->matTranspose = 0;
    cusp->workVector   = 0;
    cusp->format       = MAT_CUSPARSE_CSR;
    cusp->stream       = 0;
    cusp->handle       = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    cusp->handle       = handle;
    cusp->nonzerostate = 0;
  } else {
    /* NEXT, set the pointers to the triangular factors */
    Mat_SeqAIJCUSPARSETriFactors *tri = new Mat_SeqAIJCUSPARSETriFactors;
    B->spptr                     = tri;
    tri->loTriFactorPtr          = 0;
    tri->upTriFactorPtr          = 0;
    tri->loTriFactorPtrTranspose = 0;
    tri->upTriFactorPtrTranspose = 0;
    tri->rpermIndices            = 0;
    tri->cpermIndices            = 0;
    tri->workVector              = 0;
    tri->handle                  = 0;
    stat = cusparseCreate(&handle);CHKERRCUDA(stat);
    tri->handle                  = handle;
    tri->nnz                     = 0;
  }

  B->ops->assemblyend      = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy          = MatDestroy_SeqAIJCUSPARSE;
  B->ops->getvecs          = MatCreateVecs_SeqAIJCUSPARSE;
  B->ops->setfromoptions   = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->mult             = MatMult_SeqAIJCUSPARSE;
  B->ops->multadd          = MatMultAdd_SeqAIJCUSPARSE;
  B->ops->multtranspose    = MatMultTranspose_SeqAIJCUSPARSE;
  B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE;
  B->ops->duplicate        = MatDuplicate_SeqAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* nothing has been copied to the GPU yet; that happens on first use */
  B->valid_GPU_matrix = PETSC_OFFLOAD_UNALLOCATED;

  ierr = PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1770 
1771 /*MC
1772    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
1773 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
1776    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
1777 
1778    Options Database Keys:
1779 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
1780 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
1782 
1783   Level: beginner
1784 
1785 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
1786 M*/
1787 
1788 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
1789 
1790 
/* Register the CUSPARSE factorization backend for all four factor flavors
   (LU, Cholesky, ILU, ICC) of the SeqAIJCUSPARSE matrix type. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  int                 i;

  PetscFunctionBegin;
  for (i=0; i<4; i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1802 
1803 
/* Free all GPU-side data held in a Mat_SeqAIJCUSPARSE structure and the structure
   itself, and zero the caller's pointer. Safe to call with *cusparsestruct == NULL.
   Fix: the error returns of MatSeqAIJCUSPARSEMultStruct_Destroy were silently
   ignored; they are now checked, and the handle test no longer hides an
   assignment inside the condition. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  cusparseStatus_t stat;
  cusparseHandle_t handle;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* tear down the mult structures for both the matrix and its cached transpose */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    handle = (*cusparsestruct)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *cusparsestruct;
    *cusparsestruct = 0;
  }
  PetscFunctionReturn(0);
}
1822 
/* Release the three device-side thrust arrays backing a CsrMatrix, free the
   struct, and zero the caller's pointer. Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
1835 
/* Free one triangular-factor structure: its cuSPARSE descriptor, its solve
   analysis info, and its CSR storage. Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
    if (tf->descr)     { stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUDA(stat); }
    if (tf->solveInfo) { stat = cusparseDestroySolveAnalysisInfo(tf->solveInfo);CHKERRCUDA(stat); }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
1851 
/* Free a mult structure: the stored matrix (HYB handle or CSR arrays depending on
   format), descriptor, compressed-row indices, and device-side alpha/beta scalars.
   Safe to call with *matstruct == NULL.
   Fix: the error return of CsrMatrix_Destroy was silently ignored; it is now
   checked with CHKERRQ like every other call in this file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
        /* ELL and HYB are both stored as a cusparseHybMat_t */
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUDA(stat);
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUDA(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha and beta were allocated on the device with cudaMalloc */
    if ((*matstruct)->alpha) { err=cudaFree((*matstruct)->alpha);CHKERRCUDA(err); }
    if ((*matstruct)->beta) { err=cudaFree((*matstruct)->beta);CHKERRCUDA(err); }
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
1878 
/* Free the full set of triangular-factor data: all four factor structures,
   the permutation index arrays, the work vector, and the cuSPARSE handle.
   Safe to call with *trifactors == NULL.
   Fix: the error returns of the four MatSeqAIJCUSPARSEMultStruct_Destroy calls
   were silently ignored; they are now checked, and the handle test no longer
   hides an assignment inside the condition. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUDA(stat);
    }
    delete *trifactors;
    *trifactors = 0;
  }
  PetscFunctionReturn(0);
}
1901 
1902